## Exploratory Data Analysis
## Analysing Station_GeoLocation_Longitute_Latitude_Elevation_EPSG_4326.csv
## NOTE(review): throughout this file, "## ..." lines that follow a call are
## console output captured from a previous knitted run — a record of results,
## not executable code. Assumes Weather_Station was read in earlier (the
## read/library calls are not visible in this chunk) — TODO confirm.
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Station)
## [1] 8 4
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Station)
## Rows: 8
## Columns: 4
## $ longitude <dbl> 73.0167, 80.2500, 77.2000, 80.9330, 72.8500, 77.5833, 85…
## $ Latitude <dbl> 26.3000, 13.0667, 28.5833, 26.8667, 19.1167, 12.9667, 20…
## $ Elevation <int> 217, 6, 211, 110, 8, 920, NA, NA
## $ Location_Name <chr> "Bangalore", "Chennai", "Delhi", "Lucknow", "Mumbai", "R…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Station)
## [1] "longitude" "Latitude" "Elevation" "Location_Name"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Station)
## 'data.frame': 8 obs. of 4 variables:
## $ longitude : num 73 80.2 77.2 80.9 72.8 ...
## $ Latitude : num 26.3 13.1 28.6 26.9 19.1 ...
## $ Elevation : int 217 6 211 110 8 920 NA NA
## $ Location_Name: chr "Bangalore" "Chennai" "Delhi" "Lucknow" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Station)
## longitude Latitude Elevation Location_Name
## Min. :72.85 Min. :12.97 Min. : 6.0 Length:8
## 1st Qu.:76.15 1st Qu.:17.60 1st Qu.: 33.5 Class :character
## Median :78.92 Median :21.23 Median :160.5 Mode :character
## Mean :79.07 Mean :21.17 Mean :245.3
## 3rd Qu.:81.92 3rd Qu.:26.44 3rd Qu.:215.5
## Max. :85.83 Max. :28.58 Max. :920.0
## NA's :2
# Inspect the rows with a missing Elevation.
# Fixed: the original used attach(Weather_Station) to make the columns
# available as bare names. attach() copies the data frame onto the search
# path, where it can mask other objects and go stale if the data frame is
# later modified — a well-known R anti-pattern. Explicit $-indexing below
# produces exactly the same result (the recorded output is unchanged) and
# needs no attach()/detach() pair.
## Only Elevation seems to have some missing data, lets zoom into them
Weather_Station[is.na(Weather_Station$Elevation), ]
## longitude Latitude Elevation Location_Name
## 7 85.8333 20.2500 NA Bubhneshwar
## 8 84.8833 22.2167 NA Rourkela
## naniar missingness summaries for the station table: per-variable counts,
## then the overall proportion of missing cells.
miss_var_summary(Weather_Station)
## # A tibble: 4 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Elevation 2 25
## 2 longitude 0 0
## 3 Latitude 0 0
## 4 Location_Name 0 0
prop_miss(Weather_Station)
## [1] 0.0625
## Nothing special about why Bubhneshwar and Rourkela alone seem to have elevation missing
## No cleaning needed of this data
## Analysing and Performing Imputations on Bangalore_1990_2022_BangaloreCity.csv
## Columns: time (chr, dd-mm-yyyy), tavg/tmin/tmax (daily temps, deg C
## presumably — TODO confirm units), prcp (daily precipitation).
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Bangalore)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Bangalore)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 22.9, 21.7, 21.0, 20.8, 20.4, 20.4, 18.8, 20.0, 21.0, 21.2, 21.8,…
## $ tmin <dbl> 19.1, NA, 16.4, NA, 14.2, 17.1, NA, 16.6, 15.5, 15.0, 16.0, 13.2,…
## $ tmax <dbl> 28.4, 26.5, 26.5, 27.4, 26.1, 24.2, 20.5, 25.1, NA, 27.7, 28.5, N…
## $ prcp <dbl> NA, 0, 0, 0, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Bangalore)
## time tavg tmin tmax prcp
## 1 01-01-1990 22.9 19.1 28.4 NA
## 2 02-01-1990 21.7 NA 26.5 0
## 3 03-01-1990 21.0 16.4 26.5 0
## 4 04-01-1990 20.8 NA 27.4 0
## 5 05-01-1990 20.4 14.2 26.1 0
## 6 06-01-1990 20.4 17.1 24.2 NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Bangalore)
## time tavg tmin tmax prcp
## 11889 20-07-2022 24.9 19.8 30.8 0.0
## 11890 21-07-2022 23.7 20.5 30.8 82.5
## 11891 22-07-2022 23.2 21.1 27.9 0.0
## 11892 23-07-2022 23.1 20.9 26.7 0.0
## 11893 24-07-2022 22.8 20.0 26.7 0.3
## 11894 25-07-2022 24.1 20.2 28.5 0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# NOTE(review): all_columns is assigned but never used in this chunk.
all_columns <- names(Weather_Bangalore)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Bangalore)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 22.9 21.7 21 20.8 20.4 20.4 18.8 20 21 21.2 ...
## $ tmin: num 19.1 NA 16.4 NA 14.2 17.1 NA 16.6 15.5 15 ...
## $ tmax: num 28.4 26.5 26.5 27.4 26.1 24.2 20.5 25.1 NA 27.7 ...
## $ prcp: num NA 0 0 0 0 NA NA 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Bangalore)
## time tavg tmin tmax
## Length:11894 Min. :17.20 Min. : 9.30 Min. :19.80
## Class :character 1st Qu.:22.30 1st Qu.:18.10 1st Qu.:27.90
## Mode :character Median :23.50 Median :19.80 Median :29.50
## Mean :23.84 Mean :19.39 Mean :29.93
## 3rd Qu.:25.20 3rd Qu.:20.80 3rd Qu.:32.00
## Max. :32.40 Max. :27.90 Max. :39.20
## NA's :70 NA's :1389 NA's :629
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 4.414
## 3rd Qu.: 2.000
## Max. :271.300
## NA's :4620
## Lets analyse the missing data of the dataset
n_miss(Weather_Bangalore) ## Total number of missing parameters
## [1] 6708
miss_var_summary(Weather_Bangalore) ## Missingness summary
## # A tibble: 5 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 prcp 4620 38.8
## 2 tmin 1389 11.7
## 3 tmax 629 5.29
## 4 tavg 70 0.589
## 5 time 0 0
miss_var_span(Weather_Bangalore, var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 42 208 0.168 0.832 250
## 2 2 47 203 0.188 0.812 250
## 3 3 50 200 0.2 0.8 250
## 4 4 41 209 0.164 0.836 250
## 5 5 34 216 0.136 0.864 250
## 6 6 31 219 0.124 0.876 250
## 7 7 39 211 0.156 0.844 250
## 8 8 18 232 0.072 0.928 250
## 9 9 46 204 0.184 0.816 250
## 10 10 38 212 0.152 0.848 250
## # ℹ 38 more rows
miss_var_table(Weather_Bangalore)
## # A tibble: 5 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 1 20
## 2 70 1 20
## 3 629 1 20
## 4 1389 1 20
## 5 4620 1 20
vis_miss(Weather_Bangalore) ## visualise % of missing
gg_miss_upset(Weather_Bangalore) ## plot for missing data
# NOTE(review): gg_miss_fct expects a factor for `fct`; prcp is numeric here,
# and the recorded warning below suggests this call should be revisited.
gg_miss_fct(x = Weather_Bangalore, fct = prcp) ## Heat map of missingness
## Warning: Removed 4 rows containing missing values (`geom_tile()`).
gg_miss_span(Weather_Bangalore, var = prcp, span_every = 250) ## Visualize span of prcp missingness
## With this we can clearly see the precipitation data is missing a lot (39%), followed by tmin
miss_scan_count(data = Weather_Bangalore, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
## Variable n
## <chr> <int>
## 1 time 0
## 2 tavg 0
## 3 tmin 0
## 4 tmax 0
## 5 prcp 0
## Create shadow matrix data (one "_NA" factor column per variable)
head(as_shadow(Weather_Bangalore))
## # A tibble: 6 × 5
## time_NA tavg_NA tmin_NA tmax_NA prcp_NA
## <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA NA
## 2 !NA !NA NA !NA !NA
## 3 !NA !NA !NA !NA !NA
## 4 !NA !NA NA !NA !NA
## 5 !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA NA
# Create nabular data by binding the shadow to the data
head(bind_shadow(Weather_Bangalore, only_miss = TRUE))
## # A tibble: 6 × 9
## time tavg tmin tmax prcp tavg_NA tmin_NA tmax_NA prcp_NA
## <chr> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
## 1 01-01-1990 22.9 19.1 28.4 NA !NA !NA !NA NA
## 2 02-01-1990 21.7 NA 26.5 0 !NA NA !NA !NA
## 3 03-01-1990 21 16.4 26.5 0 !NA !NA !NA !NA
## 4 04-01-1990 20.8 NA 27.4 0 !NA NA !NA !NA
## 5 05-01-1990 20.4 14.2 26.1 0 !NA !NA !NA !NA
## 6 06-01-1990 20.4 17.1 24.2 NA !NA !NA !NA NA
# Lets explore the relationship with the missing values
Weather_Bangalore %>%
bind_shadow(only_miss = TRUE) %>%
group_by(prcp_NA) %>%
summarise(tavg_mean = mean(tavg),tavg_sd = sd(tavg))
## # A tibble: 2 × 3
## prcp_NA tavg_mean tavg_sd
## <fct> <dbl> <dbl>
## 1 !NA 23.7 2.17
## 2 NA NA NA
# The NA group's mean and SD come out NA because mean()/sd() are called
# without na.rm = TRUE and that group contains missing tavg values.
bind_shadow(Weather_Bangalore) %>%
ggplot(aes(x = tavg,
color = prcp_NA)) +
geom_density() +
facet_wrap(~tmin_NA)
## Warning: Combining variables of class <shade> and <factor> was deprecated in ggplot2
## 3.4.0.
## ℹ Please ensure your variables are compatible before plotting (location:
## `join_keys()`)
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 70 rows containing non-finite values (`stat_density()`).
# Explore the missingness in precipitation and air temperature, and display the missingness using `geom_miss_point()`
ggplot(Weather_Bangalore, aes(x = tavg,y = prcp)) + geom_miss_point()
ggplot(Weather_Bangalore, aes(x = tavg,y = prcp)) + geom_miss_point() +
facet_wrap(~year(dmy(time)))
# Temperatures are mostly complete, though prcp is missing for a large share of days
# Impute every missing value with a placeholder 10% below the variable's
# observed range (naniar's impute_below_all) — for visualisation only
Weather_Bangalore_imp <- impute_below_all(Weather_Bangalore)
ggplot(Weather_Bangalore_imp, aes(x = tavg, y = prcp)) + geom_miss_point()
# But we need to track the imputed values as well
Weather_Bangalore_imp_track <- bind_shadow(Weather_Bangalore) %>% impute_below_all()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `time_NA = (function (x, ...) ...`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
## ℹ The deprecated feature was likely used in the naniar package.
## Please report the issue at <https://github.com/njtierney/naniar/issues>.
ggplot(Weather_Bangalore_imp_track, aes(x = prcp, fill = prcp_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Bangalore_imp_track, aes(x = tmin, fill = tmin_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Bangalore_imp_track, aes(x = tavg, y = prcp, color = prcp_NA)) + geom_point()
## All NA values now carry below-range placeholder values (not real estimates)
# Now lets impute the critically missing parameters prcp and tmin
# via linear regression (impute_lm) using the other temperature variables
# NOTE(review): prcp is imputed from tavg + tmin BEFORE tmin itself is
# imputed, so rows where tmin is NA get no prcp prediction; imputing tmin
# first would cover more rows — TODO confirm intent.
Weather_Bangalore_imp_lm_temp <- Weather_Bangalore %>% bind_shadow() %>% impute_lm(prcp ~ tavg + tmin) %>% impute_lm(tmin ~ tavg) %>% add_label_shadow()
ggplot(Weather_Bangalore_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()
## Analysing and Performing Imputations on Chennai_1990_2022_Madras.csv
## Same pipeline as the Bangalore section: overview, naniar missingness
## analysis, shadow/nabular exploration, then placeholder and lm imputation.
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Chennai)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Chennai)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 25.2, 24.9, 25.6, 25.7, 25.5, 24.7, 25.4, 25.6, 24.8, 24.7, 24.5,…
## $ tmin <dbl> 22.8, 21.7, 21.4, NA, 20.7, NA, 23.3, 22.0, 21.7, 20.7, 20.0, 18.…
## $ tmax <dbl> 28.4, 29.1, 29.8, 28.7, 28.4, 26.1, 27.0, 28.0, 28.5, 29.0, 28.8,…
## $ prcp <dbl> 0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 18.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Chennai)
## time tavg tmin tmax prcp
## 1 01-01-1990 25.2 22.8 28.4 0.5
## 2 02-01-1990 24.9 21.7 29.1 0.0
## 3 03-01-1990 25.6 21.4 29.8 0.0
## 4 04-01-1990 25.7 NA 28.7 0.0
## 5 05-01-1990 25.5 20.7 28.4 0.0
## 6 06-01-1990 24.7 NA 26.1 0.5
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Chennai)
## time tavg tmin tmax prcp
## 11889 20-07-2022 28.9 26.2 33.0 9.3
## 11890 21-07-2022 28.4 24.5 32.8 21.1
## 11891 22-07-2022 27.8 24.6 32.2 22.1
## 11892 23-07-2022 27.4 24.7 32.6 18.6
## 11893 24-07-2022 27.8 25.0 33.3 9.1
## 11894 25-07-2022 28.1 25.4 32.6 2.9
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# NOTE(review): all_columns is reassigned here and never used.
all_columns <- names(Weather_Chennai)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Chennai)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 25.2 24.9 25.6 25.7 25.5 24.7 25.4 25.6 24.8 24.7 ...
## $ tmin: num 22.8 21.7 21.4 NA 20.7 NA 23.3 22 21.7 20.7 ...
## $ tmax: num 28.4 29.1 29.8 28.7 28.4 26.1 27 28 28.5 29 ...
## $ prcp: num 0.5 0 0 0 0 0.5 18 0.5 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Chennai)
## time tavg tmin tmax
## Length:11894 Min. :20.90 Min. :12.00 Min. :23.80
## Class :character 1st Qu.:26.30 1st Qu.:22.60 1st Qu.:31.10
## Mode :character Median :28.70 Median :24.60 Median :34.00
## Mean :28.49 Mean :24.38 Mean :33.91
## 3rd Qu.:30.40 3rd Qu.:26.40 3rd Qu.:36.20
## Max. :36.60 Max. :31.00 Max. :44.60
## NA's :27 NA's :3084 NA's :1019
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 6.244
## 3rd Qu.: 3.000
## Max. :344.900
## NA's :4886
sum(is.na(Weather_Chennai))
## [1] 9016
## About 9016 entries are NA
## Lets analyse the missing data of the dataset
n_miss(Weather_Chennai) ## Total number of missing parameters
## [1] 9016
miss_var_summary(Weather_Chennai) ## Missingness summary
## # A tibble: 5 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 prcp 4886 41.1
## 2 tmin 3084 25.9
## 3 tmax 1019 8.57
## 4 tavg 27 0.227
## 5 time 0 0
miss_var_span(Weather_Chennai, var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 27 223 0.108 0.892 250
## 2 2 22 228 0.088 0.912 250
## 3 3 30 220 0.12 0.88 250
## 4 4 27 223 0.108 0.892 250
## 5 5 17 233 0.068 0.932 250
## 6 6 31 219 0.124 0.876 250
## 7 7 38 212 0.152 0.848 250
## 8 8 17 233 0.068 0.932 250
## 9 9 24 226 0.096 0.904 250
## 10 10 23 227 0.092 0.908 250
## # ℹ 38 more rows
miss_var_table(Weather_Chennai)
## # A tibble: 5 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 1 20
## 2 27 1 20
## 3 1019 1 20
## 4 3084 1 20
## 5 4886 1 20
vis_miss(Weather_Chennai) ## visualise % of missing
gg_miss_upset(Weather_Chennai) ## plot for missing data
gg_miss_fct(x = Weather_Chennai, fct = prcp) ## Heat map of missingness
## Warning: Removed 4 rows containing missing values (`geom_tile()`).
gg_miss_span(Weather_Chennai, var = prcp, span_every = 250) ## Visualize span of prcp missingness
## With this we can clearly see the precipitation data is missing a lot (41%), followed by tmin
miss_scan_count(data = Weather_Chennai, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
## Variable n
## <chr> <int>
## 1 time 0
## 2 tavg 0
## 3 tmin 0
## 4 tmax 0
## 5 prcp 0
## Create shadow matrix data (one "_NA" factor column per variable)
head(as_shadow(Weather_Chennai))
## # A tibble: 6 × 5
## time_NA tavg_NA tmin_NA tmax_NA prcp_NA
## <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA
## 4 !NA !NA NA !NA !NA
## 5 !NA !NA !NA !NA !NA
## 6 !NA !NA NA !NA !NA
# Create nabular data by binding the shadow to the data
head(bind_shadow(Weather_Chennai, only_miss = TRUE))
## # A tibble: 6 × 9
## time tavg tmin tmax prcp tavg_NA tmin_NA tmax_NA prcp_NA
## <chr> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
## 1 01-01-1990 25.2 22.8 28.4 0.5 !NA !NA !NA !NA
## 2 02-01-1990 24.9 21.7 29.1 0 !NA !NA !NA !NA
## 3 03-01-1990 25.6 21.4 29.8 0 !NA !NA !NA !NA
## 4 04-01-1990 25.7 NA 28.7 0 !NA NA !NA !NA
## 5 05-01-1990 25.5 20.7 28.4 0 !NA !NA !NA !NA
## 6 06-01-1990 24.7 NA 26.1 0.5 !NA NA !NA !NA
# Lets explore the relationship with the missing values
Weather_Chennai %>%
bind_shadow(only_miss = TRUE) %>%
group_by(prcp_NA) %>%
summarise(tavg_mean = mean(tavg),tavg_sd = sd(tavg))
## # A tibble: 2 × 3
## prcp_NA tavg_mean tavg_sd
## <fct> <dbl> <dbl>
## 1 !NA 28.2 2.48
## 2 NA NA NA
# The NA group's mean and SD come out NA because mean()/sd() are called
# without na.rm = TRUE and that group contains missing tavg values.
bind_shadow(Weather_Chennai) %>%
ggplot(aes(x = tavg,
color = prcp_NA)) +
geom_density() +
facet_wrap(~tmin_NA)
## Warning: Removed 27 rows containing non-finite values (`stat_density()`).
# Explore the missingness in precipitation and air temperature, and display the missingness using `geom_miss_point()`
ggplot(Weather_Chennai, aes(x = tavg,y = prcp)) + geom_miss_point()
ggplot(Weather_Chennai, aes(x = tavg,y = prcp)) + geom_miss_point() +
facet_wrap(~year(dmy(time)))
# Temperatures are mostly complete, though prcp is missing for a large share of days
# Impute every missing value with a placeholder 10% below the variable's
# observed range (naniar's impute_below_all) — for visualisation only
Weather_Chennai_imp <- impute_below_all(Weather_Chennai)
ggplot(Weather_Chennai_imp, aes(x = tavg, y = prcp)) + geom_miss_point()
# But we need to track the imputed values as well
Weather_Chennai_imp_track <- bind_shadow(Weather_Chennai) %>% impute_below_all()
ggplot(Weather_Chennai_imp_track, aes(x = prcp, fill = prcp_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Chennai_imp_track, aes(x = tmin, fill = tmin_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Chennai_imp_track, aes(x = tavg, y = prcp, color = prcp_NA)) + geom_point()
## All NA values now carry below-range placeholder values (not real estimates)
# Now lets impute the critically missing parameters prcp and tmin
# via linear regression (impute_lm) using the other temperature variables
Weather_Chennai_imp_lm_temp <- Weather_Chennai %>% bind_shadow() %>% impute_lm(prcp ~ tavg + tmin) %>% impute_lm(tmin ~ tavg) %>% add_label_shadow()
ggplot(Weather_Chennai_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()
## Analysing and Performing Imputations on Delhi_NCR_1990_2022_Safdarjung.csv
## Same pipeline as the Bangalore section: overview, naniar missingness
## analysis, shadow/nabular exploration, then placeholder and lm imputation.
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Delhi)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Delhi)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 9.4, 9.3, 9.0, 10.7, 12.6, 14.9, 14.4, 10.7, 13.4, 16.6, 17.0, 17…
## $ tmin <dbl> 6.0, 5.2, 6.5, 6.0, 7.3, 8.1, 8.1, 8.5, 7.0, NA, 10.9, 9.8, 8.8, …
## $ tmax <dbl> 15.1, 14.2, 13.6, 17.5, 20.8, 22.9, 21.4, 16.6, 20.6, 22.8, 25.3,…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Delhi)
## time tavg tmin tmax prcp
## 1 01-01-1990 9.4 6.0 15.1 0
## 2 02-01-1990 9.3 5.2 14.2 0
## 3 03-01-1990 9.0 6.5 13.6 0
## 4 04-01-1990 10.7 6.0 17.5 0
## 5 05-01-1990 12.6 7.3 20.8 0
## 6 06-01-1990 14.9 8.1 22.9 0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Delhi)
## time tavg tmin tmax prcp
## 11889 20-07-2022 30.1 26.5 33.2 14.7
## 11890 21-07-2022 28.6 26.8 30.6 21.2
## 11891 22-07-2022 29.3 27.0 32.9 0.3
## 11892 23-07-2022 30.1 25.5 34.9 8.9
## 11893 24-07-2022 30.6 27.1 35.7 0.0
## 11894 25-07-2022 30.7 26.8 35.7 0.0
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# NOTE(review): all_columns is reassigned here and never used.
all_columns <- names(Weather_Delhi)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Delhi)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 9.4 9.3 9 10.7 12.6 14.9 14.4 10.7 13.4 16.6 ...
## $ tmin: num 6 5.2 6.5 6 7.3 8.1 8.1 8.5 7 NA ...
## $ tmax: num 15.1 14.2 13.6 17.5 20.8 22.9 21.4 16.6 20.6 22.8 ...
## $ prcp: num 0 0 0 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Delhi)
## time tavg tmin tmax
## Length:11894 Min. : 6.6 Min. : 0.10 Min. : 9.80
## Class :character 1st Qu.:18.5 1st Qu.:11.80 1st Qu.:26.70
## Mode :character Median :27.0 Median :20.00 Median :33.20
## Mean :25.0 Mean :18.88 Mean :31.79
## 3rd Qu.:30.9 3rd Qu.:26.00 3rd Qu.:36.60
## Max. :39.8 Max. :34.20 Max. :48.10
## NA's :94 NA's :1536 NA's :533
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 3.662
## 3rd Qu.: 0.500
## Max. :262.900
## NA's :6140
sum(is.na(Weather_Delhi))
## [1] 8303
## About 8303 entries are NA
## Lets analyse the missing data of the dataset
n_miss(Weather_Delhi) ## Total number of missing parameters
## [1] 8303
miss_var_summary(Weather_Delhi) ## Missingness summary
## # A tibble: 5 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 prcp 6140 51.6
## 2 tmin 1536 12.9
## 3 tmax 533 4.48
## 4 tavg 94 0.790
## 5 time 0 0
miss_var_span(Weather_Delhi, var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 37 213 0.148 0.852 250
## 2 2 61 189 0.244 0.756 250
## 3 3 24 226 0.096 0.904 250
## 4 4 101 149 0.404 0.596 250
## 5 5 18 232 0.072 0.928 250
## 6 6 21 229 0.084 0.916 250
## 7 7 39 211 0.156 0.844 250
## 8 8 13 237 0.052 0.948 250
## 9 9 20 230 0.08 0.92 250
## 10 10 27 223 0.108 0.892 250
## # ℹ 38 more rows
miss_var_table(Weather_Delhi)
## # A tibble: 5 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 1 20
## 2 94 1 20
## 3 533 1 20
## 4 1536 1 20
## 5 6140 1 20
vis_miss(Weather_Delhi) ## visualise % of missing
gg_miss_upset(Weather_Delhi) ## plot for missing data
gg_miss_fct(x = Weather_Delhi, fct = prcp) ## Heat map of missingness
## Warning: Removed 4 rows containing missing values (`geom_tile()`).
gg_miss_span(Weather_Delhi, var = prcp, span_every = 250) ## Visualize span of prcp missingness
## With this we can clearly see the precipitation data is missing a lot (52%), followed by tmin
miss_scan_count(data = Weather_Delhi, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
## Variable n
## <chr> <int>
## 1 time 0
## 2 tavg 0
## 3 tmin 0
## 4 tmax 0
## 5 prcp 0
## Create shadow matrix data (one "_NA" factor column per variable)
head(as_shadow(Weather_Delhi))
## # A tibble: 6 × 5
## time_NA tavg_NA tmin_NA tmax_NA prcp_NA
## <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA
# Create nabular data by binding the shadow to the data
head(bind_shadow(Weather_Delhi, only_miss = TRUE))
## # A tibble: 6 × 9
## time tavg tmin tmax prcp tavg_NA tmin_NA tmax_NA prcp_NA
## <chr> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
## 1 01-01-1990 9.4 6 15.1 0 !NA !NA !NA !NA
## 2 02-01-1990 9.3 5.2 14.2 0 !NA !NA !NA !NA
## 3 03-01-1990 9 6.5 13.6 0 !NA !NA !NA !NA
## 4 04-01-1990 10.7 6 17.5 0 !NA !NA !NA !NA
## 5 05-01-1990 12.6 7.3 20.8 0 !NA !NA !NA !NA
## 6 06-01-1990 14.9 8.1 22.9 0 !NA !NA !NA !NA
# Lets explore the relationship with the missing values
Weather_Delhi %>%
bind_shadow(only_miss = TRUE) %>%
group_by(prcp_NA) %>%
summarise(tavg_mean = mean(tavg),tavg_sd = sd(tavg))
## # A tibble: 2 × 3
## prcp_NA tavg_mean tavg_sd
## <fct> <dbl> <dbl>
## 1 !NA 25.1 7.07
## 2 NA NA NA
# The NA group's mean and SD come out NA because mean()/sd() are called
# without na.rm = TRUE and that group contains missing tavg values.
bind_shadow(Weather_Delhi) %>%
ggplot(aes(x = tavg,
color = prcp_NA)) +
geom_density() +
facet_wrap(~tmin_NA)
## Warning: Removed 94 rows containing non-finite values (`stat_density()`).
# Explore the missingness in precipitation and air temperature, and display the missingness using `geom_miss_point()`
ggplot(Weather_Delhi, aes(x = tavg,y = prcp)) + geom_miss_point()
ggplot(Weather_Delhi, aes(x = tavg,y = prcp)) + geom_miss_point() +
facet_wrap(~year(dmy(time)))
# Temperatures are mostly complete, though prcp is missing for over half the days
# Impute every missing value with a placeholder 10% below the variable's
# observed range (naniar's impute_below_all) — for visualisation only
Weather_Delhi_imp <- impute_below_all(Weather_Delhi)
ggplot(Weather_Delhi_imp, aes(x = tavg, y = prcp)) + geom_miss_point()
# But we need to track the imputed values as well
Weather_Delhi_imp_track <- bind_shadow(Weather_Delhi) %>% impute_below_all()
ggplot(Weather_Delhi_imp_track, aes(x = prcp, fill = prcp_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Delhi_imp_track, aes(x = tmin, fill = tmin_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Delhi_imp_track, aes(x = tavg, y = prcp, color = prcp_NA)) + geom_point()
## All NA values now carry below-range placeholder values (not real estimates)
# Now lets impute the critically missing parameters prcp and tmin
# via linear regression (impute_lm) using the other temperature variables
Weather_Delhi_imp_lm_temp <- Weather_Delhi %>% bind_shadow() %>% impute_lm(prcp ~ tavg + tmin) %>% impute_lm(tmin ~ tavg) %>% add_label_shadow()
ggplot(Weather_Delhi_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()
## Analysing and Performing Imputations on Lucknow_1990_2022.csv
## Same pipeline as the Bangalore section: overview, naniar missingness
## analysis, shadow/nabular exploration, then placeholder imputation.
## (This section appears to continue beyond the end of this chunk.)
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Lucknow)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Lucknow)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 7.2, 10.5, 10.2, 9.1, 13.5, 11.5, 14.2, 17.1, 11.1, 14.8, 12.9, 1…
## $ tmin <dbl> NA, NA, 1.8, NA, NA, 5.9, 5.4, NA, NA, 4.1, 5.1, 7.3, NA, 6.9, 9.…
## $ tmax <dbl> 18.1, 17.2, 18.6, 19.3, 23.8, 21.4, 23.6, 24.6, 24.6, 23.6, 23.6,…
## $ prcp <dbl> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, …
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Lucknow)
## time tavg tmin tmax prcp
## 1 01-01-1990 7.2 NA 18.1 0
## 2 02-01-1990 10.5 NA 17.2 0
## 3 03-01-1990 10.2 1.8 18.6 NA
## 4 04-01-1990 9.1 NA 19.3 0
## 5 05-01-1990 13.5 NA 23.8 0
## 6 06-01-1990 11.5 5.9 21.4 0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Lucknow)
## time tavg tmin tmax prcp
## 11889 20-07-2022 28.6 25.1 33.1 17.7
## 11890 21-07-2022 27.4 25.1 33.1 27.3
## 11891 22-07-2022 28.1 26.1 31.1 16.0
## 11892 23-07-2022 30.3 26.2 34.7 11.9
## 11893 24-07-2022 30.0 28.1 34.7 2.0
## 11894 25-07-2022 27.1 24.1 34.3 0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# NOTE(review): all_columns is reassigned here and never used.
all_columns <- names(Weather_Lucknow)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Lucknow)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 7.2 10.5 10.2 9.1 13.5 11.5 14.2 17.1 11.1 14.8 ...
## $ tmin: num NA NA 1.8 NA NA 5.9 5.4 NA NA 4.1 ...
## $ tmax: num 18.1 17.2 18.6 19.3 23.8 21.4 23.6 24.6 24.6 23.6 ...
## $ prcp: num 0 0 NA 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Lucknow)
## time tavg tmin tmax
## Length:11894 Min. : 5.70 Min. :-0.6 Min. :11.10
## Class :character 1st Qu.:19.50 1st Qu.:12.5 1st Qu.:28.10
## Mode :character Median :27.20 Median :20.5 Median :33.40
## Mean :25.22 Mean :18.8 Mean :32.49
## 3rd Qu.:30.40 3rd Qu.:25.1 3rd Qu.:36.50
## Max. :39.70 Max. :32.7 Max. :47.30
## NA's :138 NA's :3515 NA's :1553
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 4.536
## 3rd Qu.: 1.000
## Max. :470.900
## NA's :6152
sum(is.na(Weather_Lucknow))
## [1] 11358
## About 11358 entries are NA
## Lets analyse the missing data of the dataset
n_miss(Weather_Lucknow) ## Total number of missing parameters
## [1] 11358
miss_var_summary(Weather_Lucknow) ## Missingness summary
## # A tibble: 5 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 prcp 6152 51.7
## 2 tmin 3515 29.6
## 3 tmax 1553 13.1
## 4 tavg 138 1.16
## 5 time 0 0
miss_var_span(Weather_Lucknow, var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 75 175 0.3 0.7 250
## 2 2 65 185 0.26 0.74 250
## 3 3 39 211 0.156 0.844 250
## 4 4 39 211 0.156 0.844 250
## 5 5 21 229 0.084 0.916 250
## 6 6 35 215 0.14 0.86 250
## 7 7 34 216 0.136 0.864 250
## 8 8 24 226 0.096 0.904 250
## 9 9 47 203 0.188 0.812 250
## 10 10 67 183 0.268 0.732 250
## # ℹ 38 more rows
miss_var_table(Weather_Lucknow)
## # A tibble: 5 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 1 20
## 2 138 1 20
## 3 1553 1 20
## 4 3515 1 20
## 5 6152 1 20
vis_miss(Weather_Lucknow) ## visualise % of missing
gg_miss_upset(Weather_Lucknow) ## plot for missing data
gg_miss_fct(x = Weather_Lucknow, fct = prcp) ## Heat map of missingness
## Warning: Removed 4 rows containing missing values (`geom_tile()`).
gg_miss_span(Weather_Lucknow, var = prcp, span_every = 250) ## Visualize span of prcp missingness
## With this we can clearly see the precipitation data is missing a lot (52%), followed by tmin
miss_scan_count(data = Weather_Lucknow, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
## Variable n
## <chr> <int>
## 1 time 0
## 2 tavg 0
## 3 tmin 0
## 4 tmax 0
## 5 prcp 0
## Create shadow matrix data (one "_NA" factor column per variable)
head(as_shadow(Weather_Lucknow))
## # A tibble: 6 × 5
## time_NA tavg_NA tmin_NA tmax_NA prcp_NA
## <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA NA !NA !NA
## 2 !NA !NA NA !NA !NA
## 3 !NA !NA !NA !NA NA
## 4 !NA !NA NA !NA !NA
## 5 !NA !NA NA !NA !NA
## 6 !NA !NA !NA !NA !NA
# Create nabular data by binding the shadow to the data
head(bind_shadow(Weather_Lucknow, only_miss = TRUE))
## # A tibble: 6 × 9
## time tavg tmin tmax prcp tavg_NA tmin_NA tmax_NA prcp_NA
## <chr> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
## 1 01-01-1990 7.2 NA 18.1 0 !NA NA !NA !NA
## 2 02-01-1990 10.5 NA 17.2 0 !NA NA !NA !NA
## 3 03-01-1990 10.2 1.8 18.6 NA !NA !NA !NA NA
## 4 04-01-1990 9.1 NA 19.3 0 !NA NA !NA !NA
## 5 05-01-1990 13.5 NA 23.8 0 !NA NA !NA !NA
## 6 06-01-1990 11.5 5.9 21.4 0 !NA !NA !NA !NA
# Lets explore the relationship with the missing values
Weather_Lucknow %>%
bind_shadow(only_miss = TRUE) %>%
group_by(prcp_NA) %>%
summarise(tavg_mean = mean(tavg),tavg_sd = sd(tavg))
## # A tibble: 2 × 3
## prcp_NA tavg_mean tavg_sd
## <fct> <dbl> <dbl>
## 1 !NA 25.7 6.33
## 2 NA NA NA
# The NA group's mean and SD come out NA because mean()/sd() are called
# without na.rm = TRUE and that group contains missing tavg values.
bind_shadow(Weather_Lucknow) %>%
ggplot(aes(x = tavg,
color = prcp_NA)) +
geom_density() +
facet_wrap(~tmin_NA)
## Warning: Removed 138 rows containing non-finite values (`stat_density()`).
# Explore the missingness in precipitation and air temperature, and display the missingness using `geom_miss_point()`
ggplot(Weather_Lucknow, aes(x = tavg,y = prcp)) + geom_miss_point()
ggplot(Weather_Lucknow, aes(x = tavg,y = prcp)) + geom_miss_point() +
facet_wrap(~year(dmy(time)))
# Temperatures are mostly complete, though prcp is missing for over half the days
# Impute every missing value with a placeholder 10% below the variable's
# observed range (naniar's impute_below_all) — for visualisation only
Weather_Lucknow_imp <- impute_below_all(Weather_Lucknow)
ggplot(Weather_Lucknow_imp, aes(x = tavg, y = prcp)) + geom_miss_point()
# But we need to track the imputed values as well
Weather_Lucknow_imp_track <- bind_shadow(Weather_Lucknow) %>% impute_below_all()
ggplot(Weather_Lucknow_imp_track, aes(x = prcp, fill = prcp_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Lucknow_imp_track, aes(x = tmin, fill = tmin_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Lucknow_imp_track, aes(x = tavg, y = prcp, color = prcp_NA)) + geom_point()
## So we can successfully imputed all the NA values here
# Now lets fix the important the critically missing parameters prcp and lm
# via linear regression mechanism in relationship with other explanatory parameters
Weather_Lucknow_imp_lm_temp <- Weather_Lucknow %>% bind_shadow() %>% impute_lm(prcp ~ tavg + tmin) %>% impute_lm(tmin ~ tavg) %>% add_label_shadow()
ggplot(Weather_Lucknow_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()
## ---- EDA and imputation: Mumbai (Mumbai_1990_2022_Santacruz.csv) ----
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Mumbai)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Mumbai)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 23.2, 22.2, 21.8, 25.4, 26.5, 25.1, 26.0, 26.6, 25.1, 26.8, 25.6,…
## $ tmin <dbl> 17.0, 16.5, 16.3, 17.9, 19.3, 19.8, 18.9, 18.8, 19.0, 19.3, 18.5,…
## $ tmax <dbl> NA, 29.9, 30.7, 31.8, 33.7, 33.5, 33.7, 34.6, 34.4, 34.7, 34.0, 3…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Mumbai)
## time tavg tmin tmax prcp
## 1 01-01-1990 23.2 17.0 NA 0
## 2 02-01-1990 22.2 16.5 29.9 0
## 3 03-01-1990 21.8 16.3 30.7 0
## 4 04-01-1990 25.4 17.9 31.8 0
## 5 05-01-1990 26.5 19.3 33.7 0
## 6 06-01-1990 25.1 19.8 33.5 0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Mumbai)
## time tavg tmin tmax prcp
## 11889 20-07-2022 27.4 25.0 30.5 11.9
## 11890 21-07-2022 27.6 25.6 30.5 10.9
## 11891 22-07-2022 28.3 26.0 30.5 3.0
## 11892 23-07-2022 28.2 25.8 31.3 5.1
## 11893 24-07-2022 28.1 25.6 30.4 7.1
## 11894 25-07-2022 28.3 25.1 30.2 7.1
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# NOTE(review): the result is assigned, not printed, so the column names are never
# actually displayed here (unlike names(Weather_Station) earlier) — confirm intent
all_columns <- names(Weather_Mumbai)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Mumbai)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 23.2 22.2 21.8 25.4 26.5 25.1 26 26.6 25.1 26.8 ...
## $ tmin: num 17 16.5 16.3 17.9 19.3 19.8 18.9 18.8 19 19.3 ...
## $ tmax: num NA 29.9 30.7 31.8 33.7 33.5 33.7 34.6 34.4 34.7 ...
## $ prcp: num 0 0 0 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Mumbai)
## time tavg tmin tmax
## Length:11894 Min. :17.70 Min. : 8.50 Min. :22.30
## Class :character 1st Qu.:26.60 1st Qu.:19.80 1st Qu.:30.90
## Mode :character Median :28.10 Median :23.70 Median :32.40
## Mean :27.76 Mean :22.62 Mean :32.31
## 3rd Qu.:29.30 3rd Qu.:25.40 3rd Qu.:33.90
## Max. :33.70 Max. :30.40 Max. :41.30
## NA's :11 NA's :2454 NA's :1907
## prcp
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 10.94
## 3rd Qu.: 7.10
## Max. :461.00
## NA's :4681
sum(is.na(Weather_Mumbai))
## [1] 9053
## About 9053 entries are NA
## Lets analyse the missing data of the dataset
n_miss(Weather_Mumbai) ## Total number of missing parameters
## [1] 9053
miss_var_summary(Weather_Mumbai) ## Missingness summary
## # A tibble: 5 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 prcp 4681 39.4
## 2 tmin 2454 20.6
## 3 tmax 1907 16.0
## 4 tavg 11 0.0925
## 5 time 0 0
miss_var_span(Weather_Mumbai, var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 7 243 0.028 0.972 250
## 2 2 13 237 0.052 0.948 250
## 3 3 21 229 0.084 0.916 250
## 4 4 5 245 0.02 0.98 250
## 5 5 5 245 0.02 0.98 250
## 6 6 16 234 0.064 0.936 250
## 7 7 8 242 0.032 0.968 250
## 8 8 9 241 0.036 0.964 250
## 9 9 7 243 0.028 0.972 250
## 10 10 5 245 0.02 0.98 250
## # ℹ 38 more rows
miss_var_table(Weather_Mumbai)
## # A tibble: 5 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 1 20
## 2 11 1 20
## 3 1907 1 20
## 4 2454 1 20
## 5 4681 1 20
vis_miss(Weather_Mumbai) ## visualise % of missing
gg_miss_upset(Weather_Mumbai) ## plot for missing data
gg_miss_fct(x = Weather_Mumbai, fct = prcp) ## Heat map of missingness
## Warning: Removed 4 rows containing missing values (`geom_tile()`).
gg_miss_span(Weather_Mumbai, var = prcp, span_every = 250) ## Visualize span of prcp missingness
## With this we can clearly see the precipitation data is missing a lot (39%), followed by tmin
miss_scan_count(data = Weather_Mumbai, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
## Variable n
## <chr> <int>
## 1 time 0
## 2 tavg 0
## 3 tmin 0
## 4 tmax 0
## 5 prcp 0
## Create shadow matrix data (one !NA/NA factor column per variable)
head(as_shadow(Weather_Mumbai))
## # A tibble: 6 × 5
## time_NA tavg_NA tmin_NA tmax_NA prcp_NA
## <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA NA !NA
## 2 !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA
# Create nabular data by binding the shadow to the data
head(bind_shadow(Weather_Mumbai, only_miss = TRUE))
## # A tibble: 6 × 9
## time tavg tmin tmax prcp tavg_NA tmin_NA tmax_NA prcp_NA
## <chr> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
## 1 01-01-1990 23.2 17 NA 0 !NA !NA NA !NA
## 2 02-01-1990 22.2 16.5 29.9 0 !NA !NA !NA !NA
## 3 03-01-1990 21.8 16.3 30.7 0 !NA !NA !NA !NA
## 4 04-01-1990 25.4 17.9 31.8 0 !NA !NA !NA !NA
## 5 05-01-1990 26.5 19.3 33.7 0 !NA !NA !NA !NA
## 6 06-01-1990 25.1 19.8 33.5 0 !NA !NA !NA !NA
# Lets explore the relationship with the missing values
Weather_Mumbai %>%
bind_shadow(only_miss = TRUE) %>%
group_by(prcp_NA) %>%
summarise(tavg_mean = mean(tavg),tavg_sd = sd(tavg))
## # A tibble: 2 × 3
## prcp_NA tavg_mean tavg_sd
## <fct> <dbl> <dbl>
## 1 !NA 27.6 2.10
## 2 NA NA NA
# The NA group returns NA mean/sd because tavg itself contains NAs and na.rm = TRUE is not set
bind_shadow(Weather_Mumbai) %>%
ggplot(aes(x = tavg,
color = prcp_NA)) +
geom_density() +
facet_wrap(~tmin_NA)
## Warning: Removed 11 rows containing non-finite values (`stat_density()`).
# Explore the missingness in precipitation and air temperature, and display the missingness using geom_miss_point()
ggplot(Weather_Mumbai, aes(x = tavg,y = prcp)) + geom_miss_point()
ggplot(Weather_Mumbai, aes(x = tavg,y = prcp)) + geom_miss_point() +
facet_wrap(~year(dmy(time)))
# Looks like there is not too much missing data per year
# We would like to impute all the missing data with a value 10% below the variable's observed range
Weather_Mumbai_imp <- impute_below_all(Weather_Mumbai)
ggplot(Weather_Mumbai_imp, aes(x = tavg, y = prcp)) + geom_miss_point()
# But we need to track the imputed values as well
Weather_Mumbai_imp_track <- bind_shadow(Weather_Mumbai) %>% impute_below_all()
ggplot(Weather_Mumbai_imp_track, aes(x = prcp, fill = prcp_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Mumbai_imp_track, aes(x = tmin, fill = tmin_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Mumbai_imp_track, aes(x = tavg, y = prcp, color = prcp_NA)) + geom_point()
## So we have successfully imputed all the NA values here
# Now lets fix the critically missing parameters prcp and tmin
# via a linear regression mechanism (impute_lm) using the other explanatory parameters
Weather_Mumbai_imp_lm_temp <- Weather_Mumbai %>% bind_shadow() %>% impute_lm(prcp ~ tavg + tmin) %>% impute_lm(tmin ~ tavg) %>% add_label_shadow()
ggplot(Weather_Mumbai_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()
## ---- EDA and imputation: Jodhpur (Rajasthan_1990_2022_Jodhpur.csv) ----
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Jodhpur)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Jodhpur)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 22.9, 21.7, 21.0, 20.8, 20.4, 20.4, 18.8, 20.0, 21.0, 21.2, 21.8,…
## $ tmin <dbl> 19.1, NA, 16.4, NA, 14.2, 17.1, NA, 16.6, 15.5, 15.0, 16.0, 13.2,…
## $ tmax <dbl> 28.4, 26.5, 26.5, 27.4, 26.1, 24.2, 20.5, 25.1, NA, 27.7, 28.5, N…
## $ prcp <dbl> NA, 0, 0, 0, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Jodhpur)
## time tavg tmin tmax prcp
## 1 01-01-1990 22.9 19.1 28.4 NA
## 2 02-01-1990 21.7 NA 26.5 0
## 3 03-01-1990 21.0 16.4 26.5 0
## 4 04-01-1990 20.8 NA 27.4 0
## 5 05-01-1990 20.4 14.2 26.1 0
## 6 06-01-1990 20.4 17.1 24.2 NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Jodhpur)
## time tavg tmin tmax prcp
## 11889 20-07-2022 24.9 19.8 30.8 0.0
## 11890 21-07-2022 23.7 20.5 30.8 82.5
## 11891 22-07-2022 23.2 21.1 27.9 0.0
## 11892 23-07-2022 23.1 20.9 26.7 0.0
## 11893 24-07-2022 22.8 20.0 26.7 0.3
## 11894 25-07-2022 24.1 20.2 28.5 0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# NOTE(review): the result is assigned, not printed, so the column names are never
# actually displayed here — confirm intent
all_columns <- names(Weather_Jodhpur)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Jodhpur)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 22.9 21.7 21 20.8 20.4 20.4 18.8 20 21 21.2 ...
## $ tmin: num 19.1 NA 16.4 NA 14.2 17.1 NA 16.6 15.5 15 ...
## $ tmax: num 28.4 26.5 26.5 27.4 26.1 24.2 20.5 25.1 NA 27.7 ...
## $ prcp: num NA 0 0 0 0 NA NA 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Jodhpur)
## time tavg tmin tmax
## Length:11894 Min. :17.20 Min. : 9.30 Min. :19.80
## Class :character 1st Qu.:22.30 1st Qu.:18.10 1st Qu.:27.90
## Mode :character Median :23.50 Median :19.80 Median :29.50
## Mean :23.84 Mean :19.39 Mean :29.93
## 3rd Qu.:25.20 3rd Qu.:20.80 3rd Qu.:32.00
## Max. :32.40 Max. :27.90 Max. :39.20
## NA's :70 NA's :1389 NA's :629
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 4.414
## 3rd Qu.: 2.000
## Max. :271.300
## NA's :4620
sum(is.na(Weather_Jodhpur))
## [1] 6708
## About 6708 entries are NA
## Lets analyse the missing data of the dataset
n_miss(Weather_Jodhpur) ## Total number of missing parameters
## [1] 6708
miss_var_summary(Weather_Jodhpur) ## Missingness summary
## # A tibble: 5 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 prcp 4620 38.8
## 2 tmin 1389 11.7
## 3 tmax 629 5.29
## 4 tavg 70 0.589
## 5 time 0 0
miss_var_span(Weather_Jodhpur, var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 42 208 0.168 0.832 250
## 2 2 47 203 0.188 0.812 250
## 3 3 50 200 0.2 0.8 250
## 4 4 41 209 0.164 0.836 250
## 5 5 34 216 0.136 0.864 250
## 6 6 31 219 0.124 0.876 250
## 7 7 39 211 0.156 0.844 250
## 8 8 18 232 0.072 0.928 250
## 9 9 46 204 0.184 0.816 250
## 10 10 38 212 0.152 0.848 250
## # ℹ 38 more rows
miss_var_table(Weather_Jodhpur)
## # A tibble: 5 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 1 20
## 2 70 1 20
## 3 629 1 20
## 4 1389 1 20
## 5 4620 1 20
vis_miss(Weather_Jodhpur) ## visualise % of missing
gg_miss_upset(Weather_Jodhpur) ## plot for missing data
gg_miss_fct(x = Weather_Jodhpur, fct = prcp) ## Heat map of missingness
## Warning: Removed 4 rows containing missing values (`geom_tile()`).
gg_miss_span(Weather_Jodhpur, var = prcp, span_every = 250) ## Visualize span of prcp missingness
## With this we can clearly see the precipitation data is missing a lot (39%), followed by tmin
miss_scan_count(data = Weather_Jodhpur, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
## Variable n
## <chr> <int>
## 1 time 0
## 2 tavg 0
## 3 tmin 0
## 4 tmax 0
## 5 prcp 0
## Create shadow matrix data (one !NA/NA factor column per variable)
head(as_shadow(Weather_Jodhpur))
## # A tibble: 6 × 5
## time_NA tavg_NA tmin_NA tmax_NA prcp_NA
## <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA NA
## 2 !NA !NA NA !NA !NA
## 3 !NA !NA !NA !NA !NA
## 4 !NA !NA NA !NA !NA
## 5 !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA NA
# Create nabular data by binding the shadow to the data
head(bind_shadow(Weather_Jodhpur, only_miss = TRUE))
## # A tibble: 6 × 9
## time tavg tmin tmax prcp tavg_NA tmin_NA tmax_NA prcp_NA
## <chr> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
## 1 01-01-1990 22.9 19.1 28.4 NA !NA !NA !NA NA
## 2 02-01-1990 21.7 NA 26.5 0 !NA NA !NA !NA
## 3 03-01-1990 21 16.4 26.5 0 !NA !NA !NA !NA
## 4 04-01-1990 20.8 NA 27.4 0 !NA NA !NA !NA
## 5 05-01-1990 20.4 14.2 26.1 0 !NA !NA !NA !NA
## 6 06-01-1990 20.4 17.1 24.2 NA !NA !NA !NA NA
# Lets explore the relationship with the missing values
Weather_Jodhpur %>%
bind_shadow(only_miss = TRUE) %>%
group_by(prcp_NA) %>%
summarise(tavg_mean = mean(tavg),tavg_sd = sd(tavg))
## # A tibble: 2 × 3
## prcp_NA tavg_mean tavg_sd
## <fct> <dbl> <dbl>
## 1 !NA 23.7 2.17
## 2 NA NA NA
# The NA group returns NA mean/sd because tavg itself contains NAs and na.rm = TRUE is not set
bind_shadow(Weather_Jodhpur) %>%
ggplot(aes(x = tavg,
color = prcp_NA)) +
geom_density() +
facet_wrap(~tmin_NA)
## Warning: Removed 70 rows containing non-finite values (`stat_density()`).
# Explore the missingness in precipitation and air temperature, and display the missingness using geom_miss_point()
ggplot(Weather_Jodhpur, aes(x = tavg,y = prcp)) + geom_miss_point()
ggplot(Weather_Jodhpur, aes(x = tavg,y = prcp)) + geom_miss_point() +
facet_wrap(~year(dmy(time)))
# Looks like there is not too much missing data per year
# We would like to impute all the missing data with a value 10% below the variable's observed range
Weather_Jodhpur_imp <- impute_below_all(Weather_Jodhpur)
ggplot(Weather_Jodhpur_imp, aes(x = tavg, y = prcp)) + geom_miss_point()
# But we need to track the imputed values as well
Weather_Jodhpur_imp_track <- bind_shadow(Weather_Jodhpur) %>% impute_below_all()
ggplot(Weather_Jodhpur_imp_track, aes(x = prcp, fill = prcp_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Jodhpur_imp_track, aes(x = tmin, fill = tmin_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Weather_Jodhpur_imp_track, aes(x = tavg, y = prcp, color = prcp_NA)) + geom_point()
## So we have successfully imputed all the NA values here
# Now lets fix the critically missing parameters prcp and tmin
# via a linear regression mechanism (impute_lm) using the other explanatory parameters
Weather_Jodhpur_imp_lm_temp <- Weather_Jodhpur %>% bind_shadow() %>% impute_lm(prcp ~ tavg + tmin) %>% impute_lm(tmin ~ tavg) %>% add_label_shadow()
ggplot(Weather_Jodhpur_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()
## ---- EDA: Bhubhneshwar (weather_Bhubhneshwar_1990_2022.csv) ----
## Have a look at the data
# definitely has more columns than the cities that we have seen so far
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Bhubhneshwar)
## [1] 11935 11
# OK, so we have 11 columns, 6 more than the other city datasets
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Bhubhneshwar)
## Rows: 11,935
## Columns: 11
## $ time <chr> "1990-01-01", "1990-01-02", "1990-01-03", "1990-01-04", "1990-01-…
## $ tavg <dbl> 20.1, 20.7, 20.7, 18.8, 19.8, 22.2, 20.8, 20.3, 22.3, 21.6, 21.7,…
## $ tmin <dbl> NA, 16.4, 16.0, NA, 11.0, 12.5, NA, 13.6, 14.8, 14.5, 15.6, 12.8,…
## $ tmax <dbl> 28.0, NA, 27.4, 28.0, 28.2, NA, NA, 29.5, 31.6, 30.8, 30.7, 29.3,…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, NA, 0, 0, 0, …
## $ snow <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wdir <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wspd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wpgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ pres <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ tsun <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Bhubhneshwar)
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 1 1990-01-01 20.1 NA 28.0 0 NA NA NA NA NA NA
## 2 1990-01-02 20.7 16.4 NA 0 NA NA NA NA NA NA
## 3 1990-01-03 20.7 16.0 27.4 0 NA NA NA NA NA NA
## 4 1990-01-04 18.8 NA 28.0 0 NA NA NA NA NA NA
## 5 1990-01-05 19.8 11.0 28.2 0 NA NA NA NA NA NA
## 6 1990-01-06 22.2 12.5 NA 0 NA NA NA NA NA NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Bhubhneshwar)
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 11930 2022-08-30 30.0 27.0 34.0 1.2 NA 169 8.3 NA 1007.6 NA
## 11931 2022-08-31 29.2 26.3 33.0 9.0 NA 186 8.2 NA 1006.6 NA
## 11932 2022-09-01 29.6 27.0 33.0 2.1 NA 190 9.5 NA 1006.8 NA
## 11933 2022-09-02 29.7 26.3 33.0 3.3 NA 198 9.5 NA 1007.3 NA
## 11934 2022-09-03 29.2 26.1 34.0 9.7 NA 215 8.5 NA 1005.5 NA
## 11935 2022-09-04 27.6 25.9 31.6 12.8 NA 214 8.6 NA 1004.9 NA
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Bhubhneshwar)
## [1] "time" "tavg" "tmin" "tmax" "prcp" "snow" "wdir" "wspd" "wpgt" "pres"
## [11] "tsun"
## So the additional columns are: snow, wind direction (wdir), wind speed (wspd),
## peak wind gust (wpgt), pressure (pres) and sunshine duration (tsun)
## (Meteostat-style column names — NOTE(review): tsun is sunshine duration, not "tsunami"; confirm against data source docs)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Bhubhneshwar)
## 'data.frame': 11935 obs. of 11 variables:
## $ time: chr "1990-01-01" "1990-01-02" "1990-01-03" "1990-01-04" ...
## $ tavg: num 20.1 20.7 20.7 18.8 19.8 22.2 20.8 20.3 22.3 21.6 ...
## $ tmin: num NA 16.4 16 NA 11 12.5 NA 13.6 14.8 14.5 ...
## $ tmax: num 28 NA 27.4 28 28.2 NA NA 29.5 31.6 30.8 ...
## $ prcp: num 0 0 0 0 0 0 0 0 0 0 ...
## $ snow: logi NA NA NA NA NA NA ...
## $ wdir: num NA NA NA NA NA NA NA NA NA NA ...
## $ wspd: num NA NA NA NA NA NA NA NA NA NA ...
## $ wpgt: logi NA NA NA NA NA NA ...
## $ pres: num NA NA NA NA NA NA NA NA NA NA ...
## $ tsun: logi NA NA NA NA NA NA ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Bhubhneshwar)
## time tavg tmin tmax
## Length:11935 Min. :15.70 Min. : 8.20 Min. :19.4
## Class :character 1st Qu.:24.70 1st Qu.:19.00 1st Qu.:30.4
## Mode :character Median :27.70 Median :24.00 Median :32.8
## Mean :26.99 Mean :22.24 Mean :33.0
## 3rd Qu.:29.40 3rd Qu.:25.60 3rd Qu.:35.4
## Max. :37.40 Max. :31.80 Max. :46.7
## NA's :78 NA's :2090 NA's :891
## prcp snow wdir wspd
## Min. : 0.000 Mode:logical Min. : 0.0 Min. : 0.500
## 1st Qu.: 0.000 NA's:11935 1st Qu.: 89.0 1st Qu.: 4.500
## Median : 0.000 Median :188.0 Median : 7.000
## Mean : 7.074 Mean :169.1 Mean : 8.399
## 3rd Qu.: 4.100 3rd Qu.:220.8 3rd Qu.:11.000
## Max. :470.900 Max. :359.0 Max. :33.100
## NA's :5097 NA's :10641 NA's :9806
## wpgt pres tsun
## Mode:logical Min. : 990.6 Mode:logical
## NA's:11935 1st Qu.:1002.9 NA's:11935
## Median :1007.3
## Mean :1007.4
## 3rd Qu.:1012.4
## Max. :1019.3
## NA's :10692
sum(is.na(Weather_Bhubhneshwar))
## [1] 75100
## About 75100 entries are NA
## Lets analyse the missing data of the dataset
n_miss(Weather_Bhubhneshwar) ## Total number of missing parameters
## [1] 75100
miss_var_summary(Weather_Bhubhneshwar) ## Missingness summary
## # A tibble: 11 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 snow 11935 100
## 2 wpgt 11935 100
## 3 tsun 11935 100
## 4 pres 10692 89.6
## 5 wdir 10641 89.2
## 6 wspd 9806 82.2
## 7 prcp 5097 42.7
## 8 tmin 2090 17.5
## 9 tmax 891 7.47
## 10 tavg 78 0.654
## 11 time 0 0
miss_var_span(Weather_Bhubhneshwar, var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 48 202 0.192 0.808 250
## 2 2 39 211 0.156 0.844 250
## 3 3 30 220 0.12 0.88 250
## 4 4 36 214 0.144 0.856 250
## 5 5 33 217 0.132 0.868 250
## 6 6 27 223 0.108 0.892 250
## 7 7 32 218 0.128 0.872 250
## 8 8 28 222 0.112 0.888 250
## 9 9 36 214 0.144 0.856 250
## 10 10 34 216 0.136 0.864 250
## # ℹ 38 more rows
miss_var_table(Weather_Bhubhneshwar)
## # A tibble: 9 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 1 9.09
## 2 78 1 9.09
## 3 891 1 9.09
## 4 2090 1 9.09
## 5 5097 1 9.09
## 6 9806 1 9.09
## 7 10641 1 9.09
## 8 10692 1 9.09
## 9 11935 3 27.3
vis_miss(Weather_Bhubhneshwar) ## visualise % of missing
gg_miss_upset(Weather_Bhubhneshwar) ## plot for missing data
gg_miss_fct(x = Weather_Bhubhneshwar, fct = prcp) ## Heat map of missingness
## Warning: Removed 10 rows containing missing values (`geom_tile()`).
gg_miss_span(Weather_Bhubhneshwar, var = prcp, span_every = 250) ## Visualize span of prcp missingness
## Here prcp is about 43% missing, while snow, wpgt and tsun are entirely missing
## and pres/wdir/wspd are missing for most of the record
miss_scan_count(data = Weather_Bhubhneshwar, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 11 × 2
## Variable n
## <chr> <int>
## 1 time 0
## 2 tavg 0
## 3 tmin 0
## 4 tmax 0
## 5 prcp 0
## 6 snow 0
## 7 wdir 0
## 8 wspd 0
## 9 wpgt 0
## 10 pres 0
## 11 tsun 0
## Create shadow matrix data (one !NA/NA factor column per variable)
head(as_shadow(Weather_Bhubhneshwar))
## # A tibble: 6 × 11
## time_NA tavg_NA tmin_NA tmax_NA prcp_NA snow_NA wdir_NA wspd_NA wpgt_NA
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA NA !NA !NA NA NA NA NA
## 2 !NA !NA !NA NA !NA NA NA NA NA
## 3 !NA !NA !NA !NA !NA NA NA NA NA
## 4 !NA !NA NA !NA !NA NA NA NA NA
## 5 !NA !NA !NA !NA !NA NA NA NA NA
## 6 !NA !NA !NA NA !NA NA NA NA NA
## # ℹ 2 more variables: pres_NA <fct>, tsun_NA <fct>
# Create nabular data by binding the shadow to the data
head(bind_shadow(Weather_Bhubhneshwar, only_miss = TRUE))
## # A tibble: 6 × 21
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun tavg_NA
## <chr> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <dbl> <lgl> <dbl> <lgl> <fct>
## 1 1990-01-01 20.1 NA 28 0 NA NA NA NA NA NA !NA
## 2 1990-01-02 20.7 16.4 NA 0 NA NA NA NA NA NA !NA
## 3 1990-01-03 20.7 16 27.4 0 NA NA NA NA NA NA !NA
## 4 1990-01-04 18.8 NA 28 0 NA NA NA NA NA NA !NA
## 5 1990-01-05 19.8 11 28.2 0 NA NA NA NA NA NA !NA
## 6 1990-01-06 22.2 12.5 NA 0 NA NA NA NA NA NA !NA
## # ℹ 9 more variables: tmin_NA <fct>, tmax_NA <fct>, prcp_NA <fct>,
## # snow_NA <fct>, wdir_NA <fct>, wspd_NA <fct>, wpgt_NA <fct>, pres_NA <fct>,
## # tsun_NA <fct>
# Lets explore the relationship with the missing values
Weather_Bhubhneshwar %>%
bind_shadow(only_miss = TRUE) %>%
group_by(prcp_NA) %>%
summarise(tavg_mean = mean(tavg),tavg_sd = sd(tavg))
## # A tibble: 2 × 3
## prcp_NA tavg_mean tavg_sd
## <fct> <dbl> <dbl>
## 1 !NA 27.2 3.29
## 2 NA NA NA
# The NA group returns NA mean/sd because tavg itself contains NAs and na.rm = TRUE is not set
bind_shadow(Weather_Bhubhneshwar) %>%
ggplot(aes(x = tavg,
color = prcp_NA)) +
geom_density() +
facet_wrap(~tmin_NA)
## Warning: Removed 78 rows containing non-finite values (`stat_density()`).
# Explore the missingness in precipitation and air temperature using geom_miss_point()
ggplot(Weather_Bhubhneshwar, aes(x = tavg,y = prcp)) + geom_miss_point()
# BUG FIX: this dataset stores time as "YYYY-MM-DD" (e.g. "1990-01-01"), unlike the
# other cities which use "DD-MM-YYYY". Parsing with dmy() produced
# "Warning: All formats failed to parse. No formats found." and an all-NA facet
# variable; use ymd() so the per-year faceting actually works.
ggplot(Weather_Bhubhneshwar, aes(x = tavg,y = prcp)) + geom_miss_point() +
facet_wrap(~year(ymd(time)))
# Looks like there is not too much missing data in the temperature columns per year
## Analysing weather_Rourkela_2021_2022.csv
## Have a look at the data
# Definitely has more columns than the cities that we have seen so far
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Rourkela)
## [1] 426 11
# OK, so we have 11 columns, 6 more than others
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Rourkela)
## Rows: 426
## Columns: 11
## $ time <chr> "2021-07-06", "2021-07-07", "2021-07-08", "2021-07-09", "2021-07-…
## $ tavg <dbl> 29.3, 29.7, 27.4, 28.5, 29.0, 29.3, 28.9, 28.6, 29.0, 29.5, 29.6,…
## $ tmin <dbl> 26.2, 27.3, 25.8, 26.1, 26.2, 26.2, 25.7, 25.5, 25.4, 25.5, 26.3,…
## $ tmax <dbl> 32.6, 33.4, 29.7, 32.1, 32.6, 33.7, 32.9, 32.5, 32.7, 33.4, 33.2,…
## $ prcp <dbl> NA, 11.1, 66.9, 11.4, 2.7, 10.8, 5.4, 10.1, 1.9, 1.3, 1.1, 6.0, 8…
## $ snow <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wdir <dbl> 197, 199, 186, 173, 121, 70, 95, 101, 138, 152, 179, 181, 181, 19…
## $ wspd <dbl> 6.8, 6.9, 6.3, 3.9, 4.6, 5.8, 7.0, 5.5, 6.5, 8.7, 9.5, 8.3, 8.0, …
## $ wpgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ pres <dbl> 1002.5, 1002.2, 1001.8, 1001.0, 1000.9, 1002.2, 1003.4, 1002.8, 1…
## $ tsun <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Rourkela)
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 1 2021-07-06 29.3 26.2 32.6 NA NA 197 6.8 NA 1002.5 NA
## 2 2021-07-07 29.7 27.3 33.4 11.1 NA 199 6.9 NA 1002.2 NA
## 3 2021-07-08 27.4 25.8 29.7 66.9 NA 186 6.3 NA 1001.8 NA
## 4 2021-07-09 28.5 26.1 32.1 11.4 NA 173 3.9 NA 1001.0 NA
## 5 2021-07-10 29.0 26.2 32.6 2.7 NA 121 4.6 NA 1000.9 NA
## 6 2021-07-11 29.3 26.2 33.7 10.8 NA 70 5.8 NA 1002.2 NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Rourkela)
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 421 2022-08-30 29.8 26.4 34.3 0.0 NA 174 7.6 NA 1007.9 NA
## 422 2022-08-31 29.0 26.6 33.5 2.0 NA 187 8.6 NA 1006.8 NA
## 423 2022-09-01 29.1 25.7 33.2 11.5 NA 205 6.7 NA 1007.2 NA
## 424 2022-09-02 29.4 26.4 33.7 1.5 NA 189 7.0 NA 1007.5 NA
## 425 2022-09-03 28.7 26.6 32.6 8.0 NA 203 8.0 NA 1005.8 NA
## 426 2022-09-04 28.2 25.9 31.8 17.7 NA 211 6.8 NA 1004.8 NA
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Rourkela)
## [1] "time" "tavg" "tmin" "tmax" "prcp" "snow" "wdir" "wspd" "wpgt" "pres"
## [11] "tsun"
## So the additional columns are: snow, wind direction, wind speed, wind pgt, pressure and tsunami
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Rourkela)
## 'data.frame': 426 obs. of 11 variables:
## $ time: chr "2021-07-06" "2021-07-07" "2021-07-08" "2021-07-09" ...
## $ tavg: num 29.3 29.7 27.4 28.5 29 29.3 28.9 28.6 29 29.5 ...
## $ tmin: num 26.2 27.3 25.8 26.1 26.2 26.2 25.7 25.5 25.4 25.5 ...
## $ tmax: num 32.6 33.4 29.7 32.1 32.6 33.7 32.9 32.5 32.7 33.4 ...
## $ prcp: num NA 11.1 66.9 11.4 2.7 10.8 5.4 10.1 1.9 1.3 ...
## $ snow: logi NA NA NA NA NA NA ...
## $ wdir: num 197 199 186 173 121 70 95 101 138 152 ...
## $ wspd: num 6.8 6.9 6.3 3.9 4.6 5.8 7 5.5 6.5 8.7 ...
## $ wpgt: logi NA NA NA NA NA NA ...
## $ pres: num 1002 1002 1002 1001 1001 ...
## $ tsun: logi NA NA NA NA NA NA ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Rourkela)
## time tavg tmin tmax
## Length:426 Min. :14.60 Min. : 8.20 Min. :21.50
## Class :character 1st Qu.:24.40 1st Qu.:18.18 1st Qu.:29.60
## Mode :character Median :28.10 Median :25.20 Median :32.10
## Mean :26.71 Mean :22.30 Mean :32.25
## 3rd Qu.:29.30 3rd Qu.:26.10 3rd Qu.:33.80
## Max. :35.00 Max. :29.30 Max. :43.60
## NA's :2 NA's :2 NA's :2
## prcp snow wdir wspd
## Min. : 0.000 Mode:logical Min. : 0.0 Min. : 2.900
## 1st Qu.: 0.000 NA's:426 1st Qu.: 49.0 1st Qu.: 5.500
## Median : 0.200 Median :168.0 Median : 6.600
## Mean : 5.695 Mean :140.3 Mean : 7.441
## 3rd Qu.: 7.200 3rd Qu.:195.2 3rd Qu.: 8.725
## Max. :123.000 Max. :359.0 Max. :20.400
## NA's :3 NA's :2 NA's :2
## wpgt pres tsun
## Mode:logical Min. : 993.1 Mode:logical
## NA's:426 1st Qu.:1002.5 NA's:426
## Median :1005.5
## Mean :1006.8
## 3rd Qu.:1012.1
## Max. :1020.6
## NA's :2
# Count every NA cell across the whole data frame
sum(is.na(Weather_Rourkela))
## [1] 1293
## About 1293 entries are NA
## Lets analyse the missing data of the dataset
n_miss(Weather_Rourkela) ## Total number of missing values (same as sum above)
## [1] 1293
miss_var_summary(Weather_Rourkela) ## Per-variable missingness summary
## # A tibble: 11 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 snow 426 100
## 2 wpgt 426 100
## 3 tsun 426 100
## 4 prcp 3 0.704
## 5 tavg 2 0.469
## 6 tmin 2 0.469
## 7 tmax 2 0.469
## 8 wdir 2 0.469
## 9 wspd 2 0.469
## 10 pres 2 0.469
## 11 time 0 0
miss_var_span(Weather_Rourkela, var = prcp, span_every = 250) ## Missingness spread over 250-row spans
## # A tibble: 2 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 1 249 0.004 0.996 250
## 2 2 2 174 0.0114 0.989 176
miss_var_table(Weather_Rourkela)
## # A tibble: 4 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 1 9.09
## 2 2 6 54.5
## 3 3 1 9.09
## 4 426 3 27.3
vis_miss(Weather_Rourkela) ## visualise % of missing
gg_miss_upset(Weather_Rourkela) ## upset plot of missing-data combinations
# NOTE(review): gg_miss_fct expects a factor in fct; prcp is numeric, which is
# likely the cause of the geom_tile warning below — verify a categorical
# variable wasn't intended here
gg_miss_fct(x = Weather_Rourkela, fct = prcp) ## Heat map of missingness
## Warning: Removed 10 rows containing missing values (`geom_tile()`).
gg_miss_span(Weather_Rourkela, var = prcp, span_every = 250) ## Visualize span of prcp missingness
## Per the summaries above: snow, wpgt and tsun are missing entirely (100%),
## while among the measured variables prcp has the most gaps — but only 3
## values (~0.7%)
# Print the full Rourkela data frame (all 426 rows follow) to eyeball raw values
Weather_Rourkela
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 1 2021-07-06 29.3 26.2 32.6 NA NA 197 6.8 NA 1002.5 NA
## 2 2021-07-07 29.7 27.3 33.4 11.1 NA 199 6.9 NA 1002.2 NA
## 3 2021-07-08 27.4 25.8 29.7 66.9 NA 186 6.3 NA 1001.8 NA
## 4 2021-07-09 28.5 26.1 32.1 11.4 NA 173 3.9 NA 1001.0 NA
## 5 2021-07-10 29.0 26.2 32.6 2.7 NA 121 4.6 NA 1000.9 NA
## 6 2021-07-11 29.3 26.2 33.7 10.8 NA 70 5.8 NA 1002.2 NA
## 7 2021-07-12 28.9 25.7 32.9 5.4 NA 95 7.0 NA 1003.4 NA
## 8 2021-07-13 28.6 25.5 32.5 10.1 NA 101 5.5 NA 1002.8 NA
## 9 2021-07-14 29.0 25.4 32.7 1.9 NA 138 6.5 NA 1002.7 NA
## 10 2021-07-15 29.5 25.5 33.4 1.3 NA 152 8.7 NA 1004.0 NA
## 11 2021-07-16 29.6 26.3 33.2 1.1 NA 179 9.5 NA 1006.1 NA
## 12 2021-07-17 29.8 27.0 32.9 6.0 NA 181 8.3 NA 1004.9 NA
## 13 2021-07-18 29.6 27.4 32.3 8.9 NA 181 8.0 NA 1002.8 NA
## 14 2021-07-19 29.0 27.1 31.5 12.1 NA 194 6.6 NA 1001.2 NA
## 15 2021-07-20 28.7 26.8 31.9 28.7 NA 254 5.6 NA 1001.1 NA
## 16 2021-07-21 28.4 26.2 31.7 10.1 NA 230 5.1 NA 998.5 NA
## 17 2021-07-22 28.8 25.5 32.8 17.3 NA 55 6.9 NA 995.8 NA
## 18 2021-07-23 27.5 26.5 29.0 48.3 NA 5 8.9 NA 994.6 NA
## 19 2021-07-24 28.5 26.2 32.3 12.4 NA 203 9.2 NA 996.8 NA
## 20 2021-07-25 29.4 26.0 33.0 4.7 NA 203 9.4 NA 999.7 NA
## 21 2021-07-26 29.5 26.9 33.0 1.8 NA 237 11.0 NA 999.5 NA
## 22 2021-07-27 27.9 26.5 30.5 20.9 NA 244 12.8 NA 998.3 NA
## 23 2021-07-28 27.1 26.0 29.2 9.9 NA 257 15.5 NA 998.5 NA
## 24 2021-07-29 27.4 25.8 30.6 6.4 NA 260 13.9 NA 999.0 NA
## 25 2021-07-30 26.9 25.8 28.7 17.1 NA 250 17.4 NA 998.7 NA
## 26 2021-07-31 27.1 25.2 30.7 16.0 NA 198 14.1 NA 999.6 NA
## 27 2021-08-01 28.6 25.7 32.7 2.6 NA 208 10.5 NA 1000.4 NA
## 28 2021-08-02 28.6 25.1 33.1 0.8 NA 224 9.0 NA 1001.6 NA
## 29 2021-08-03 28.5 25.7 31.8 0.4 NA 243 9.9 NA 1002.8 NA
## 30 2021-08-04 27.7 26.2 30.4 13.1 NA 221 5.1 NA 1003.4 NA
## 31 2021-08-05 28.3 26.3 31.5 8.2 NA 210 5.5 NA 1001.4 NA
## 32 2021-08-06 28.2 25.8 32.2 19.3 NA 201 5.5 NA 1001.2 NA
## 33 2021-08-07 28.5 26.5 31.9 17.5 NA 201 6.8 NA 1003.4 NA
## 34 2021-08-08 28.2 26.2 31.5 11.2 NA 193 6.5 NA 1005.2 NA
## 35 2021-08-09 29.1 26.4 32.6 10.6 NA 202 8.3 NA 1005.0 NA
## 36 2021-08-10 29.3 26.8 33.2 9.5 NA 223 9.1 NA 1002.9 NA
## 37 2021-08-11 28.9 26.4 32.0 13.8 NA 233 8.1 NA 1002.9 NA
## 38 2021-08-12 29.0 26.0 32.6 3.5 NA 199 6.2 NA 1004.7 NA
## 39 2021-08-13 29.6 26.7 33.2 1.0 NA 204 5.5 NA 1003.6 NA
## 40 2021-08-14 29.7 26.6 33.9 13.2 NA 209 5.1 NA 1001.8 NA
## 41 2021-08-15 28.8 26.1 33.2 17.7 NA 331 4.6 NA 1002.5 NA
## 42 2021-08-16 29.0 26.4 33.1 13.4 NA 193 3.7 NA 1004.1 NA
## 43 2021-08-17 29.1 26.5 32.9 9.8 NA 56 5.4 NA 1003.4 NA
## 44 2021-08-18 27.6 26.3 29.8 6.4 NA 171 7.9 NA 1002.1 NA
## 45 2021-08-19 28.0 25.9 31.4 9.0 NA 176 9.0 NA 1002.5 NA
## 46 2021-08-20 27.7 25.4 31.3 7.4 NA 169 9.4 NA 1004.8 NA
## 47 2021-08-21 28.5 25.9 31.9 3.8 NA 176 11.6 NA 1007.3 NA
## 48 2021-08-22 28.6 25.9 32.4 3.2 NA 192 7.3 NA 1007.1 NA
## 49 2021-08-23 29.2 26.5 33.3 5.0 NA 197 9.5 NA 1005.8 NA
## 50 2021-08-24 29.0 26.3 32.3 9.0 NA 216 6.2 NA 1004.9 NA
## 51 2021-08-25 29.2 26.7 33.4 20.9 NA 210 6.6 NA 1002.6 NA
## 52 2021-08-26 28.3 26.8 31.8 5.7 NA 217 5.6 NA 1002.0 NA
## 53 2021-08-27 27.6 26.2 29.5 4.2 NA 9 3.4 NA 1003.2 NA
## 54 2021-08-28 28.7 26.1 33.0 9.7 NA 58 7.2 NA 1001.3 NA
## 55 2021-08-29 28.4 25.7 32.8 8.3 NA 77 5.7 NA 1001.7 NA
## 56 2021-08-30 28.4 25.2 32.2 5.4 NA 111 7.1 NA 1004.6 NA
## 57 2021-08-31 28.4 25.6 32.6 6.7 NA 137 7.3 NA 1007.0 NA
## 58 2021-09-01 28.8 24.8 32.9 0.8 NA 164 8.1 NA 1007.5 NA
## 59 2021-09-02 28.8 26.0 33.3 5.0 NA 176 5.5 NA 1006.3 NA
## 60 2021-09-03 28.3 25.7 31.9 18.5 NA 132 3.9 NA 1004.7 NA
## 61 2021-09-04 28.6 25.7 32.7 10.6 NA 184 3.5 NA 1003.6 NA
## 62 2021-09-05 29.1 25.9 33.4 3.6 NA 76 4.4 NA 1003.2 NA
## 63 2021-09-06 28.4 26.2 32.0 8.7 NA 46 13.5 NA 1002.9 NA
## 64 2021-09-07 28.8 26.3 32.0 3.4 NA 107 10.4 NA 1003.7 NA
## 65 2021-09-08 27.8 25.7 29.9 4.9 NA 141 5.3 NA 1005.4 NA
## 66 2021-09-09 27.9 25.8 30.6 8.0 NA 176 4.2 NA 1005.2 NA
## 67 2021-09-10 28.2 25.7 31.3 4.9 NA 72 3.2 NA 1004.2 NA
## 68 2021-09-11 27.0 25.3 31.2 38.7 NA 48 7.3 NA 1002.6 NA
## 69 2021-09-12 28.2 25.7 32.5 4.3 NA 35 11.2 NA 999.7 NA
## 70 2021-09-13 27.1 26.3 29.0 48.6 NA 33 20.4 NA 994.2 NA
## 71 2021-09-14 27.2 26.1 30.3 113.4 NA 162 19.0 NA 998.9 NA
## 72 2021-09-15 27.4 25.5 30.3 16.7 NA 165 12.1 NA 1004.2 NA
## 73 2021-09-16 27.9 25.3 31.4 8.4 NA 152 5.6 NA 1005.0 NA
## 74 2021-09-17 27.9 25.1 31.6 14.9 NA 181 5.5 NA 1004.7 NA
## 75 2021-09-18 28.5 25.5 32.2 1.6 NA 198 5.6 NA 1006.4 NA
## 76 2021-09-19 27.3 25.6 30.2 7.4 NA 191 5.2 NA 1008.0 NA
## 77 2021-09-20 27.4 25.2 30.8 6.6 NA 187 2.9 NA 1006.6 NA
## 78 2021-09-21 27.0 25.0 30.4 11.6 NA 337 4.6 NA 1004.3 NA
## 79 2021-09-22 26.8 24.9 29.6 13.1 NA 225 6.9 NA 1003.0 NA
## 80 2021-09-23 27.2 25.1 30.0 10.4 NA 137 5.3 NA 1005.5 NA
## 81 2021-09-24 28.2 23.8 32.8 0.2 NA 140 5.9 NA 1006.9 NA
## 82 2021-09-25 28.7 25.1 32.6 0.5 NA 110 4.5 NA 1005.6 NA
## 83 2021-09-26 28.6 26.1 33.0 1.5 NA 49 11.9 NA 1003.8 NA
## 84 2021-09-27 27.9 25.6 31.9 4.5 NA 72 9.6 NA 1004.2 NA
## 85 2021-09-28 27.2 24.8 30.8 11.2 NA 67 5.2 NA 1006.3 NA
## 86 2021-09-29 27.2 25.2 30.4 14.9 NA 226 5.5 NA 1006.6 NA
## 87 2021-09-30 26.8 25.0 29.9 17.9 NA 184 10.2 NA 1008.2 NA
## 88 2021-10-01 27.5 24.8 31.5 17.2 NA 187 8.3 NA 1008.5 NA
## 89 2021-10-02 28.3 25.9 31.8 12.7 NA 210 7.2 NA 1008.4 NA
## 90 2021-10-03 28.3 25.3 32.4 11.8 NA 201 6.3 NA 1007.4 NA
## 91 2021-10-04 28.4 24.8 32.2 6.0 NA 191 4.8 NA 1007.0 NA
## 92 2021-10-05 28.1 25.4 32.3 1.2 NA 127 3.5 NA 1008.5 NA
## 93 2021-10-06 28.5 25.2 32.9 0.8 NA 177 4.1 NA 1007.8 NA
## 94 2021-10-07 29.1 26.0 32.8 0.9 NA 207 5.6 NA 1006.4 NA
## 95 2021-10-08 29.0 25.8 33.4 0.8 NA 191 4.6 NA 1005.6 NA
## 96 2021-10-09 28.8 24.9 33.2 0.0 NA 52 4.8 NA 1005.8 NA
## 97 2021-10-10 27.8 23.0 32.9 0.0 NA 355 4.5 NA 1006.5 NA
## 98 2021-10-11 27.7 22.5 33.0 0.0 NA 23 5.2 NA 1006.0 NA
## 99 2021-10-12 28.3 23.5 33.1 0.0 NA 0 5.0 NA 1005.5 NA
## 100 2021-10-13 28.9 24.2 33.8 0.0 NA 14 5.8 NA 1004.8 NA
## 101 2021-10-14 28.1 24.2 32.8 1.8 NA 31 7.7 NA 1004.7 NA
## 102 2021-10-15 28.7 25.8 33.8 1.2 NA 76 6.9 NA 1003.4 NA
## 103 2021-10-16 27.7 24.8 32.8 5.6 NA 47 6.7 NA 1004.4 NA
## 104 2021-10-17 26.9 25.0 31.5 20.0 NA 53 6.9 NA 1005.3 NA
## 105 2021-10-18 26.2 24.2 28.7 20.9 NA 202 6.1 NA 1004.7 NA
## 106 2021-10-19 26.9 25.0 30.2 15.3 NA 182 10.4 NA 1007.9 NA
## 107 2021-10-20 27.0 24.6 30.9 0.8 NA 237 6.3 NA 1010.6 NA
## 108 2021-10-21 26.0 21.9 31.0 0.0 NA 253 5.2 NA 1011.1 NA
## 109 2021-10-22 25.0 19.8 31.6 0.0 NA 210 4.5 NA 1012.0 NA
## 110 2021-10-23 24.8 18.7 31.5 0.0 NA 25 3.9 NA 1013.5 NA
## 111 2021-10-24 24.7 19.2 30.8 0.0 NA 25 4.4 NA 1013.9 NA
## 112 2021-10-25 24.5 18.6 30.5 0.0 NA 12 5.3 NA 1012.5 NA
## 113 2021-10-26 24.3 18.8 30.2 0.0 NA 16 6.3 NA 1011.8 NA
## 114 2021-10-27 25.2 20.4 30.3 0.0 NA 37 8.6 NA 1012.8 NA
## 115 2021-10-28 25.3 20.9 30.4 0.0 NA 40 9.1 NA 1013.3 NA
## 116 2021-10-29 24.9 20.4 30.5 0.0 NA 42 8.3 NA 1014.6 NA
## 117 2021-10-30 25.1 20.1 30.8 0.0 NA 43 7.1 NA 1014.5 NA
## 118 2021-10-31 25.1 20.3 30.9 0.0 NA 13 6.7 NA 1013.8 NA
## 119 2021-11-01 24.5 19.7 30.3 0.0 NA 358 4.3 NA 1013.8 NA
## 120 2021-11-02 25.1 19.9 30.1 0.0 NA 68 5.6 NA 1014.4 NA
## 121 2021-11-03 24.9 21.0 30.1 0.0 NA 72 4.7 NA 1013.8 NA
## 122 2021-11-04 24.2 19.1 30.2 0.0 NA 41 4.7 NA 1013.2 NA
## 123 2021-11-05 23.9 18.7 30.0 0.0 NA 49 5.4 NA 1013.0 NA
## 124 2021-11-06 22.5 16.7 29.4 0.0 NA 28 5.6 NA 1011.7 NA
## 125 2021-11-07 22.3 15.7 29.3 0.0 NA 25 5.6 NA 1011.9 NA
## 126 2021-11-08 22.2 15.7 29.8 0.0 NA 23 7.0 NA 1012.7 NA
## 127 2021-11-09 22.5 15.4 30.0 0.0 NA 29 6.5 NA 1013.3 NA
## 128 2021-11-10 23.0 15.9 30.5 0.0 NA 20 6.3 NA 1012.5 NA
## 129 2021-11-11 24.6 18.3 31.1 0.0 NA 36 9.0 NA 1011.3 NA
## 130 2021-11-12 25.8 21.6 31.6 9.7 NA 55 5.8 NA 1011.8 NA
## 131 2021-11-13 24.7 23.2 27.1 34.4 NA 34 5.4 NA 1012.1 NA
## 132 2021-11-14 24.6 23.2 28.0 34.4 NA 53 4.2 NA 1011.5 NA
## 133 2021-11-15 24.2 21.0 27.2 14.4 NA 83 7.1 NA 1010.5 NA
## 134 2021-11-16 23.7 20.4 28.7 0.0 NA 43 8.1 NA 1011.5 NA
## 135 2021-11-17 23.0 18.3 28.6 0.0 NA 37 8.3 NA 1013.5 NA
## 136 2021-11-18 22.3 18.0 28.4 0.0 NA 26 6.5 NA 1012.2 NA
## 137 2021-11-19 22.9 17.7 29.0 0.0 NA 37 5.2 NA 1010.4 NA
## 138 2021-11-20 25.3 20.5 30.5 0.0 NA 58 4.9 NA 1010.2 NA
## 139 2021-11-21 25.4 21.6 30.4 0.0 NA 57 4.8 NA 1011.5 NA
## 140 2021-11-22 25.5 22.2 29.9 0.1 NA 194 4.2 NA 1011.7 NA
## 141 2021-11-23 24.7 20.1 30.3 0.0 NA 15 4.0 NA 1011.5 NA
## 142 2021-11-24 23.6 19.1 29.6 0.0 NA 21 5.5 NA 1012.9 NA
## 143 2021-11-25 23.1 17.9 29.8 0.0 NA 37 4.5 NA 1013.2 NA
## 144 2021-11-26 22.4 17.7 28.9 0.0 NA 31 5.1 NA 1013.7 NA
## 145 2021-11-27 21.2 16.5 28.0 0.0 NA 24 6.9 NA 1015.1 NA
## 146 2021-11-28 19.7 13.6 26.9 0.0 NA 25 6.5 NA 1016.3 NA
## 147 2021-11-29 19.6 13.9 27.0 0.0 NA 15 5.7 NA 1016.2 NA
## 148 2021-11-30 19.7 14.2 26.9 0.0 NA 24 7.5 NA 1015.9 NA
## 149 2021-12-01 19.7 13.4 27.3 0.0 NA 33 8.2 NA 1015.4 NA
## 150 2021-12-02 20.2 13.5 27.6 0.0 NA 9 6.2 NA 1014.7 NA
## 151 2021-12-03 21.1 14.7 27.7 0.0 NA 24 6.9 NA 1015.0 NA
## 152 2021-12-04 21.6 20.3 23.3 0.7 NA 32 8.4 NA 1014.7 NA
## 153 2021-12-05 21.6 20.4 24.5 4.7 NA 40 14.5 NA 1013.5 NA
## 154 2021-12-06 22.4 18.4 27.6 0.5 NA 37 8.1 NA 1012.6 NA
## 155 2021-12-07 22.3 17.7 28.1 0.0 NA 41 6.5 NA 1014.6 NA
## 156 2021-12-08 22.2 16.7 28.7 0.0 NA 43 8.1 NA 1016.2 NA
## 157 2021-12-09 22.6 19.1 27.1 0.0 NA 35 6.9 NA 1017.7 NA
## 158 2021-12-10 21.7 18.3 27.8 0.0 NA 69 4.7 NA 1017.6 NA
## 159 2021-12-11 21.6 16.5 27.9 0.0 NA 40 5.8 NA 1017.3 NA
## 160 2021-12-12 21.2 17.4 27.1 0.0 NA 44 6.1 NA 1018.1 NA
## 161 2021-12-13 19.6 14.3 26.2 0.0 NA 346 4.4 NA 1017.0 NA
## 162 2021-12-14 19.0 13.4 26.0 0.0 NA 34 5.0 NA 1016.2 NA
## 163 2021-12-15 18.6 12.9 25.6 0.0 NA 29 6.5 NA 1016.2 NA
## 164 2021-12-16 18.2 12.9 25.1 0.0 NA 44 7.5 NA 1017.0 NA
## 165 2021-12-17 18.6 11.1 25.9 0.0 NA 42 5.5 NA 1017.1 NA
## 166 2021-12-18 19.6 12.9 27.4 0.0 NA 35 6.9 NA 1016.4 NA
## 167 2021-12-19 18.0 13.0 24.0 0.0 NA 13 7.5 NA 1017.6 NA
## 168 2021-12-20 14.6 8.2 21.5 0.0 NA 24 6.4 NA 1018.1 NA
## 169 2021-12-21 15.0 9.1 22.8 0.0 NA 338 5.6 NA 1016.8 NA
## 170 2021-12-22 16.2 9.7 24.5 0.0 NA 17 5.3 NA 1014.8 NA
## 171 2021-12-23 16.6 9.2 25.5 0.0 NA 13 5.7 NA 1014.6 NA
## 172 2021-12-24 17.8 10.4 26.6 0.0 NA 131 5.6 NA 1014.3 NA
## 173 2021-12-25 19.7 12.6 27.9 0.0 NA 138 6.1 NA 1014.2 NA
## 174 2021-12-26 20.2 14.1 27.6 0.0 NA 29 4.7 NA 1016.2 NA
## 175 2021-12-27 20.0 13.7 27.7 0.0 NA 137 5.7 NA 1017.2 NA
## 176 2021-12-28 20.6 15.6 27.2 11.7 NA 100 5.2 NA 1016.7 NA
## 177 2021-12-29 19.3 17.8 21.5 21.9 NA 104 6.2 NA 1017.1 NA
## 178 2021-12-30 19.3 16.3 24.1 3.7 NA 20 6.5 NA 1020.1 NA
## 179 2021-12-31 18.3 14.8 24.0 0.0 NA 42 6.9 NA 1020.6 NA
## 180 2022-01-01 17.6 12.7 23.1 0.0 NA 20 6.1 NA 1020.3 NA
## 181 2022-01-02 17.3 12.0 23.0 0.0 NA 23 5.8 NA 1018.8 NA
## 182 2022-01-03 16.7 11.6 23.8 0.0 NA 32 5.2 NA 1017.4 NA
## 183 2022-01-04 17.1 10.6 25.1 0.0 NA 24 4.7 NA 1017.3 NA
## 184 2022-01-05 18.5 12.9 25.5 0.0 NA 12 3.0 NA 1016.8 NA
## 185 2022-01-06 19.1 13.8 26.5 0.0 NA 4 4.1 NA 1016.0 NA
## 186 2022-01-07 19.7 14.0 27.1 0.0 NA 32 5.9 NA 1016.5 NA
## 187 2022-01-08 19.7 14.1 27.3 0.0 NA 163 4.7 NA 1016.5 NA
## 188 2022-01-09 20.0 15.1 27.7 0.0 NA 93 4.9 NA 1014.4 NA
## 189 2022-01-10 19.9 16.4 23.3 0.5 NA 19 4.9 NA 1015.7 NA
## 190 2022-01-11 19.4 16.9 22.0 15.1 NA 40 3.5 NA 1015.8 NA
## 191 2022-01-12 18.7 16.4 22.1 2.2 NA 154 4.3 NA 1015.3 NA
## 192 2022-01-13 18.9 14.8 24.7 0.0 NA 46 6.5 NA 1015.8 NA
## 193 2022-01-14 18.9 16.1 23.0 0.0 NA 44 8.0 NA 1015.7 NA
## 194 2022-01-15 18.8 15.5 23.8 0.5 NA 32 6.6 NA 1016.4 NA
## 195 2022-01-16 18.0 13.5 23.4 0.0 NA 35 6.5 NA 1018.3 NA
## 196 2022-01-17 17.1 11.4 23.4 0.0 NA 21 5.3 NA 1019.0 NA
## 197 2022-01-18 16.4 10.6 23.2 0.0 NA 5 5.4 NA 1016.7 NA
## 198 2022-01-19 16.6 10.2 23.3 0.0 NA 0 5.8 NA 1014.5 NA
## 199 2022-01-20 16.4 10.8 23.7 0.0 NA 27 4.7 NA 1012.0 NA
## 200 2022-01-21 18.1 11.1 26.0 0.2 NA 61 4.3 NA 1011.1 NA
## 201 2022-01-22 19.5 13.9 26.8 0.0 NA 110 4.4 NA 1009.7 NA
## 202 2022-01-23 19.6 16.7 23.9 4.8 NA 140 4.8 NA 1009.4 NA
## 203 2022-01-24 20.2 16.1 25.7 0.7 NA 316 4.6 NA 1010.6 NA
## 204 2022-01-25 18.5 14.4 23.6 0.0 NA 22 6.6 NA 1012.3 NA
## 205 2022-01-26 18.5 14.6 24.0 0.2 NA 349 7.8 NA 1013.3 NA
## 206 2022-01-27 16.3 11.7 22.2 0.0 NA 349 7.2 NA 1015.4 NA
## 207 2022-01-28 14.9 9.2 21.6 0.0 NA 4 6.3 NA 1016.7 NA
## 208 2022-01-29 16.0 8.4 23.0 0.0 NA 5 5.1 NA 1016.5 NA
## 209 2022-01-30 16.6 9.7 25.2 0.0 NA 204 4.5 NA 1013.2 NA
## 210 2022-01-31 18.1 11.1 27.4 0.0 NA 201 4.6 NA 1011.9 NA
## 211 2022-02-01 18.9 12.1 27.5 0.0 NA 129 4.6 NA 1010.5 NA
## 212 2022-02-02 20.2 12.7 28.8 0.0 NA 75 4.7 NA 1011.9 NA
## 213 2022-02-03 21.2 13.2 29.0 0.0 NA 137 9.6 NA 1010.6 NA
## 214 2022-02-04 20.4 15.1 25.6 22.1 NA 212 10.6 NA 1008.9 NA
## 215 2022-02-05 17.7 13.1 24.3 0.0 NA 0 6.3 NA 1012.7 NA
## 216 2022-02-06 17.3 10.3 25.5 0.0 NA 351 5.9 NA 1016.0 NA
## 217 2022-02-07 18.3 11.7 26.9 0.0 NA 342 5.2 NA 1015.2 NA
## 218 2022-02-08 18.9 11.4 27.5 0.0 NA 26 5.3 NA 1013.9 NA
## 219 2022-02-09 21.1 13.5 28.3 0.0 NA 154 10.1 NA 1013.5 NA
## 220 2022-02-10 21.2 17.2 26.1 0.7 NA 129 6.2 NA 1013.7 NA
## 221 2022-02-11 18.9 13.0 25.4 0.0 NA 338 7.4 NA 1014.8 NA
## 222 2022-02-12 17.9 11.3 25.8 0.0 NA 29 6.0 NA 1014.6 NA
## 223 2022-02-13 17.5 9.7 26.1 0.0 NA 9 5.4 NA 1013.6 NA
## 224 2022-02-14 18.4 10.6 27.5 0.0 NA 8 5.1 NA 1012.9 NA
## 225 2022-02-15 19.6 12.6 28.1 0.0 NA 53 5.5 NA 1012.3 NA
## 226 2022-02-16 21.1 13.4 28.9 0.0 NA 53 5.4 NA 1010.1 NA
## 227 2022-02-17 21.7 16.1 29.2 0.0 NA 71 5.8 NA 1010.1 NA
## 228 2022-02-18 21.7 14.9 29.6 0.0 NA 45 6.8 NA 1011.6 NA
## 229 2022-02-19 22.7 14.5 30.4 0.0 NA 321 5.8 NA 1012.1 NA
## 230 2022-02-20 24.4 18.6 31.2 0.0 NA 249 9.1 NA 1008.2 NA
## 231 2022-02-21 22.2 14.1 29.9 0.0 NA 187 8.1 NA 1009.0 NA
## 232 2022-02-22 21.0 13.0 29.6 0.0 NA 35 5.8 NA 1013.2 NA
## 233 2022-02-23 21.9 14.4 31.2 0.0 NA 143 6.2 NA 1015.7 NA
## 234 2022-02-24 24.4 18.2 32.5 0.0 NA 172 9.1 NA 1014.9 NA
## 235 2022-02-25 24.8 18.1 32.8 0.0 NA 175 8.0 NA 1014.3 NA
## 236 2022-02-26 24.4 17.8 32.1 0.0 NA 21 7.1 NA 1015.8 NA
## 237 2022-02-27 25.0 17.6 33.2 0.0 NA 102 6.1 NA 1015.0 NA
## 238 2022-02-28 24.6 17.8 32.3 0.0 NA 39 5.5 NA 1014.3 NA
## 239 2022-03-01 25.3 18.1 32.8 0.0 NA 8 6.4 NA 1014.6 NA
## 240 2022-03-02 25.3 17.9 33.0 0.0 NA 38 5.2 NA 1014.4 NA
## 241 2022-03-03 25.1 17.7 33.0 0.0 NA 67 4.9 NA 1014.3 NA
## 242 2022-03-04 25.1 17.5 33.6 0.0 NA 347 6.5 NA 1013.4 NA
## 243 2022-03-05 25.6 18.4 33.6 0.0 NA 209 7.6 NA 1011.8 NA
## 244 2022-03-06 25.1 16.9 33.0 0.0 NA 29 6.9 NA 1012.5 NA
## 245 2022-03-07 25.3 17.5 33.5 0.0 NA 20 5.2 NA 1012.9 NA
## 246 2022-03-08 25.5 18.4 33.6 0.0 NA 6 6.1 NA 1011.8 NA
## 247 2022-03-09 25.3 16.9 33.9 0.0 NA 32 6.4 NA 1011.6 NA
## 248 2022-03-10 26.1 17.5 35.1 0.0 NA 40 8.1 NA 1011.1 NA
## 249 2022-03-11 26.6 18.1 34.7 0.0 NA 7 6.5 NA 1010.8 NA
## 250 2022-03-12 26.8 19.8 34.7 0.0 NA 331 7.0 NA 1010.3 NA
## 251 2022-03-13 25.8 18.2 34.3 0.0 NA 17 6.0 NA 1010.7 NA
## 252 2022-03-14 26.2 17.6 35.1 0.0 NA 332 6.3 NA 1010.3 NA
## 253 2022-03-15 27.5 18.5 36.4 0.0 NA 273 4.7 NA 1007.5 NA
## 254 2022-03-16 28.1 21.0 36.6 0.0 NA 134 5.7 NA 1005.5 NA
## 255 2022-03-17 29.3 21.0 37.9 0.0 NA 159 6.6 NA 1005.7 NA
## 256 2022-03-18 29.6 21.5 37.8 0.0 NA 161 7.1 NA 1007.0 NA
## 257 2022-03-19 30.1 24.2 37.0 0.0 NA 189 6.8 NA 1005.7 NA
## 258 2022-03-20 29.9 22.1 38.1 0.0 NA 185 6.8 NA 1004.6 NA
## 259 2022-03-21 30.4 24.0 37.7 0.0 NA 174 7.5 NA 1005.7 NA
## 260 2022-03-22 30.3 23.8 38.3 0.0 NA 168 7.6 NA 1006.0 NA
## 261 2022-03-23 29.4 21.7 38.0 0.0 NA 134 5.7 NA 1006.6 NA
## 262 2022-03-24 29.4 20.4 38.5 0.0 NA 195 5.8 NA 1006.4 NA
## 263 2022-03-25 30.7 23.9 38.2 0.0 NA 202 9.9 NA 1006.6 NA
## 264 2022-03-26 32.0 26.4 38.6 0.0 NA 212 8.0 NA 1007.6 NA
## 265 2022-03-27 30.1 21.3 38.4 0.0 NA 312 6.9 NA 1009.1 NA
## 266 2022-03-28 29.4 20.8 38.3 0.0 NA 178 7.2 NA 1007.9 NA
## 267 2022-03-29 30.1 19.9 39.9 0.0 NA 180 7.7 NA 1005.9 NA
## 268 2022-03-30 30.2 21.5 40.0 0.0 NA 167 7.1 NA 1005.7 NA
## 269 2022-03-31 31.2 22.5 40.4 0.0 NA 175 9.0 NA 1006.5 NA
## 270 2022-04-01 31.8 24.6 40.6 0.0 NA 181 11.1 NA 1006.3 NA
## 271 2022-04-02 32.0 24.8 40.3 0.0 NA 170 7.0 NA 1007.5 NA
## 272 2022-04-03 31.5 24.7 40.2 0.0 NA 177 7.1 NA 1007.6 NA
## 273 2022-04-04 31.8 25.1 40.6 0.0 NA 188 13.1 NA 1008.1 NA
## 274 2022-04-05 31.8 24.6 40.5 0.0 NA 187 10.6 NA 1010.2 NA
## 275 2022-04-06 31.7 25.0 40.1 0.0 NA 184 8.2 NA 1010.4 NA
## 276 2022-04-07 31.9 24.2 39.9 0.0 NA 182 7.8 NA 1009.5 NA
## 277 2022-04-08 32.6 25.6 40.5 0.0 NA 181 10.7 NA 1007.5 NA
## 278 2022-04-09 33.0 25.8 40.6 0.0 NA 190 11.8 NA 1006.1 NA
## 279 2022-04-10 33.5 25.3 41.4 0.0 NA 195 13.3 NA 1004.7 NA
## 280 2022-04-11 33.5 26.2 42.2 0.0 NA 192 10.6 NA 1003.9 NA
## 281 2022-04-12 33.6 24.5 41.6 0.0 NA 176 8.8 NA 1004.0 NA
## 282 2022-04-13 33.9 26.2 41.8 0.0 NA 197 11.3 NA 1003.1 NA
## 283 2022-04-14 34.6 28.9 41.4 0.0 NA 199 11.2 NA 1002.8 NA
## 284 2022-04-15 34.0 26.7 41.1 0.0 NA 191 11.4 NA 1002.8 NA
## 285 2022-04-16 33.2 25.5 41.1 0.0 NA 185 8.7 NA 1002.6 NA
## 286 2022-04-17 33.1 25.8 41.2 0.0 NA 194 7.3 NA 1003.3 NA
## 287 2022-04-18 33.4 25.0 42.1 0.0 NA 206 7.1 NA 1005.1 NA
## 288 2022-04-19 34.0 24.8 42.6 0.0 NA 189 10.0 NA 1005.8 NA
## 289 2022-04-20 34.0 26.2 42.2 0.0 NA 171 9.5 NA 1004.7 NA
## 290 2022-04-21 31.7 26.4 39.8 0.7 NA 161 8.9 NA 1005.2 NA
## 291 2022-04-22 32.6 26.8 39.7 0.0 NA 172 7.3 NA 1006.6 NA
## 292 2022-04-23 33.4 25.9 40.7 0.0 NA 193 7.1 NA 1006.2 NA
## 293 2022-04-24 34.1 27.0 41.7 0.0 NA 189 8.3 NA 1004.4 NA
## 294 2022-04-25 33.3 23.8 41.6 0.0 NA 181 5.8 NA 1004.1 NA
## 295 2022-04-26 33.3 24.4 41.9 0.0 NA 179 7.1 NA 1004.6 NA
## 296 2022-04-27 33.7 26.1 42.1 0.0 NA 184 8.4 NA 1004.1 NA
## 297 2022-04-28 34.3 26.0 42.9 0.0 NA 179 7.3 NA 1004.6 NA
## 298 2022-04-29 34.4 26.4 42.7 0.0 NA 182 8.2 NA 1004.2 NA
## 299 2022-04-30 35.0 27.0 43.6 0.0 NA 190 13.0 NA 1002.1 NA
## 300 2022-05-01 32.1 26.4 41.2 0.0 NA 146 10.0 NA 1002.0 NA
## 301 2022-05-02 31.5 27.4 40.3 0.0 NA 165 10.6 NA 1002.9 NA
## 302 2022-05-03 32.5 26.8 41.3 0.0 NA 173 13.6 NA 1003.9 NA
## 303 2022-05-04 30.4 25.6 37.2 1.5 NA 146 6.5 NA 1005.4 NA
## 304 2022-05-05 30.5 25.3 36.4 0.8 NA 135 6.3 NA 1005.4 NA
## 305 2022-05-06 31.4 25.3 38.0 0.0 NA 123 6.3 NA 1005.2 NA
## 306 2022-05-07 31.9 25.8 38.8 0.0 NA 157 6.4 NA 1004.6 NA
## 307 2022-05-08 31.9 25.6 38.2 0.0 NA 142 6.4 NA 1003.8 NA
## 308 2022-05-09 31.3 24.8 37.7 0.0 NA 108 5.9 NA 1003.6 NA
## 309 2022-05-10 30.5 26.8 37.1 5.6 NA 110 5.7 NA 1003.6 NA
## 310 2022-05-11 29.6 26.4 34.6 3.1 NA 159 9.1 NA 1003.6 NA
## 311 2022-05-12 31.2 26.8 36.6 0.3 NA 183 16.5 NA 1002.6 NA
## 312 2022-05-13 33.3 27.2 39.9 0.0 NA 199 14.8 NA 1000.2 NA
## 313 2022-05-14 34.2 27.8 42.0 0.0 NA 192 17.1 NA 1000.1 NA
## 314 2022-05-15 32.9 28.1 41.9 0.0 NA 183 16.7 NA 1001.9 NA
## 315 2022-05-16 33.7 27.8 41.5 0.0 NA 191 13.4 NA 1001.7 NA
## 316 2022-05-17 33.1 28.2 39.6 0.0 NA 188 14.4 NA 1001.5 NA
## 317 2022-05-18 32.3 28.1 38.3 0.0 NA 179 13.7 NA 1002.9 NA
## 318 2022-05-19 31.8 27.9 37.8 0.3 NA 188 11.2 NA 1003.9 NA
## 319 2022-05-20 33.4 27.4 39.9 0.0 NA 203 11.9 NA 1001.2 NA
## 320 2022-05-21 32.5 28.1 41.5 0.1 NA 190 12.8 NA 998.5 NA
## 321 2022-05-22 31.0 25.8 38.9 3.3 NA 172 10.1 NA 997.5 NA
## 322 2022-05-23 30.8 25.5 36.3 0.9 NA 193 6.6 NA 999.0 NA
## 323 2022-05-24 30.2 25.4 36.7 11.5 NA 157 6.4 NA 1003.2 NA
## 324 2022-05-25 29.8 24.7 36.0 0.0 NA 181 5.5 NA 1004.6 NA
## 325 2022-05-26 NA NA NA NA NA NA NA NA NA NA
## 326 2022-05-27 NA NA NA NA NA NA NA NA NA NA
## 327 2022-05-28 31.0 25.4 37.2 0.0 NA 220 7.6 NA 1000.6 NA
## 328 2022-05-29 32.1 26.3 39.2 0.0 NA 182 5.5 NA 1000.2 NA
## 329 2022-05-30 32.5 28.1 38.0 0.0 NA 223 6.7 NA 1000.2 NA
## 330 2022-05-31 33.3 26.9 39.8 0.0 NA 182 6.1 NA 999.0 NA
## 331 2022-06-01 32.3 26.9 39.7 0.0 NA 152 6.3 NA 999.6 NA
## 332 2022-06-02 33.0 27.3 40.9 0.0 NA 161 7.1 NA 999.5 NA
## 333 2022-06-03 33.0 27.7 39.9 0.0 NA 183 9.5 NA 999.4 NA
## 334 2022-06-04 32.0 28.0 39.5 2.5 NA 171 10.7 NA 1000.3 NA
## 335 2022-06-05 30.9 27.0 39.3 4.2 NA 168 10.8 NA 1000.9 NA
## 336 2022-06-06 30.8 26.3 37.3 0.0 NA 166 9.8 NA 1000.6 NA
## 337 2022-06-07 32.4 27.5 39.9 0.0 NA 186 12.0 NA 999.4 NA
## 338 2022-06-08 33.0 27.1 40.7 0.0 NA 179 9.1 NA 998.4 NA
## 339 2022-06-09 32.3 28.4 38.6 6.5 NA 175 11.7 NA 998.5 NA
## 340 2022-06-10 33.2 29.3 39.1 0.3 NA 194 9.8 NA 999.6 NA
## 341 2022-06-11 33.2 28.2 39.8 0.0 NA 205 12.1 NA 1000.5 NA
## 342 2022-06-12 33.4 28.0 40.2 0.0 NA 201 10.7 NA 1001.4 NA
## 343 2022-06-13 32.4 28.5 38.4 0.0 NA 193 8.5 NA 1003.0 NA
## 344 2022-06-14 33.2 28.4 39.9 0.7 NA 193 9.5 NA 1002.1 NA
## 345 2022-06-15 30.2 27.6 34.2 6.2 NA 215 8.7 NA 1003.3 NA
## 346 2022-06-16 30.4 27.0 36.3 4.0 NA 201 10.8 NA 1003.8 NA
## 347 2022-06-17 28.5 26.8 33.5 15.5 NA 189 11.1 NA 1004.3 NA
## 348 2022-06-18 28.2 26.3 32.2 22.3 NA 200 8.9 NA 1004.7 NA
## 349 2022-06-19 27.7 26.3 31.9 11.7 NA 201 9.0 NA 1003.9 NA
## 350 2022-06-20 28.1 26.3 31.6 16.3 NA 200 8.7 NA 1002.1 NA
## 351 2022-06-21 27.8 25.3 32.1 3.2 NA 184 7.0 NA 1000.8 NA
## 352 2022-06-22 28.0 24.6 33.2 1.0 NA 171 3.4 NA 1002.9 NA
## 353 2022-06-23 28.5 25.8 31.7 7.1 NA 196 6.3 NA 1003.8 NA
## 354 2022-06-24 27.5 25.8 30.6 33.1 NA 184 5.8 NA 1002.3 NA
## 355 2022-06-25 27.8 25.7 31.0 2.6 NA 188 5.1 NA 1000.4 NA
## 356 2022-06-26 28.6 25.3 32.9 2.7 NA 159 7.2 NA 1002.0 NA
## 357 2022-06-27 29.8 26.1 33.9 0.1 NA 173 10.4 NA 1003.5 NA
## 358 2022-06-28 30.0 26.4 34.0 0.2 NA 185 10.2 NA 1002.1 NA
## 359 2022-06-29 29.5 26.5 33.7 5.3 NA 201 6.4 NA 1000.7 NA
## 360 2022-06-30 27.9 25.9 31.3 23.5 NA 187 5.5 NA 1000.5 NA
## 361 2022-07-01 28.4 25.9 32.0 22.2 NA 220 5.2 NA 1000.2 NA
## 362 2022-07-02 28.3 26.1 32.4 26.1 NA 219 5.2 NA 999.1 NA
## 363 2022-07-03 27.5 26.1 31.1 18.9 NA 186 5.2 NA 999.0 NA
## 364 2022-07-04 28.4 25.5 32.5 4.3 NA 153 5.4 NA 998.7 NA
## 365 2022-07-05 28.3 26.0 32.2 23.1 NA 156 5.1 NA 998.9 NA
## 366 2022-07-06 27.8 25.9 30.8 9.5 NA 122 6.7 NA 1000.6 NA
## 367 2022-07-07 29.3 25.8 34.4 1.5 NA 125 5.3 NA 1002.6 NA
## 368 2022-07-08 28.9 25.9 34.3 3.4 NA 90 7.3 NA 1000.3 NA
## 369 2022-07-09 29.0 26.1 33.1 6.4 NA 66 9.8 NA 997.9 NA
## 370 2022-07-10 29.0 26.3 33.3 2.8 NA 60 9.7 NA 997.3 NA
## 371 2022-07-11 29.0 26.1 33.1 2.6 NA 52 12.2 NA 997.9 NA
## 372 2022-07-12 29.1 26.8 32.2 1.8 NA 49 16.1 NA 996.4 NA
## 373 2022-07-13 29.2 26.8 32.8 4.7 NA 59 13.8 NA 994.9 NA
## 374 2022-07-14 28.0 26.5 30.5 27.6 NA 75 10.8 NA 995.3 NA
## 375 2022-07-15 27.7 25.8 31.6 24.2 NA 64 6.5 NA 998.4 NA
## 376 2022-07-16 27.9 25.7 31.9 15.0 NA 45 5.3 NA 999.6 NA
## 377 2022-07-17 28.7 26.4 32.0 11.4 NA 93 11.5 NA 999.8 NA
## 378 2022-07-18 28.6 25.6 32.5 2.1 NA 162 10.0 NA 1001.3 NA
## 379 2022-07-19 29.1 26.3 31.9 1.5 NA 218 10.9 NA 1002.0 NA
## 380 2022-07-20 29.2 26.0 32.8 10.1 NA 231 8.4 NA 1001.9 NA
## 381 2022-07-21 28.6 26.0 32.5 7.0 NA 217 5.2 NA 1001.9 NA
## 382 2022-07-22 27.8 26.2 31.8 26.6 NA 140 3.4 NA 1002.4 NA
## 383 2022-07-23 27.6 25.7 30.7 14.2 NA 120 3.2 NA 1001.9 NA
## 384 2022-07-24 27.2 25.5 29.8 20.5 NA 148 5.8 NA 1002.6 NA
## 385 2022-07-25 27.6 25.0 31.0 12.1 NA 172 8.9 NA 1005.4 NA
## 386 2022-07-26 28.4 25.5 32.2 3.2 NA 174 10.0 NA 1007.2 NA
## 387 2022-07-27 28.6 25.7 32.1 1.0 NA 180 8.1 NA 1007.2 NA
## 388 2022-07-28 28.9 26.0 31.8 5.5 NA 197 5.3 NA 1007.4 NA
## 389 2022-07-29 28.6 25.8 32.5 10.6 NA 212 5.8 NA 1006.4 NA
## 390 2022-07-30 28.9 26.2 34.0 4.6 NA 200 6.2 NA 1004.8 NA
## 391 2022-07-31 28.6 25.9 32.8 1.8 NA 204 5.1 NA 1003.0 NA
## 392 2022-08-01 28.8 26.1 33.0 7.3 NA 206 6.0 NA 1002.8 NA
## 393 2022-08-02 28.9 25.7 33.4 6.8 NA 208 4.6 NA 1002.4 NA
## 394 2022-08-03 28.5 26.5 31.3 12.2 NA 136 4.0 NA 1003.0 NA
## 395 2022-08-04 28.2 26.0 31.6 12.4 NA 81 3.8 NA 1003.1 NA
## 396 2022-08-05 28.8 25.6 32.5 3.4 NA 107 5.5 NA 1001.3 NA
## 397 2022-08-06 28.9 26.5 32.8 13.9 NA 359 4.4 NA 999.0 NA
## 398 2022-08-07 28.5 26.3 31.9 6.0 NA 48 11.5 NA 997.2 NA
## 399 2022-08-08 29.0 26.4 33.6 6.8 NA 57 13.5 NA 995.4 NA
## 400 2022-08-09 28.0 26.7 31.1 8.0 NA 62 16.4 NA 993.1 NA
## 401 2022-08-10 26.5 25.9 27.8 14.4 NA 153 11.6 NA 994.6 NA
## 402 2022-08-11 26.3 25.1 28.0 30.5 NA 178 10.0 NA 999.4 NA
## 403 2022-08-12 26.4 24.8 30.6 46.6 NA 212 7.8 NA 1001.0 NA
## 404 2022-08-13 27.3 25.2 31.1 8.0 NA 278 7.0 NA 999.4 NA
## 405 2022-08-14 26.8 25.1 29.1 88.4 NA 19 11.2 NA 996.5 NA
## 406 2022-08-15 27.3 25.6 29.7 28.3 NA 170 18.5 NA 1001.0 NA
## 407 2022-08-16 28.6 24.6 33.3 1.6 NA 208 7.8 NA 1005.7 NA
## 408 2022-08-17 29.0 24.8 33.3 0.1 NA 236 6.8 NA 1004.9 NA
## 409 2022-08-18 28.3 25.9 33.1 20.2 NA 279 6.3 NA 1002.5 NA
## 410 2022-08-19 27.0 25.4 29.9 26.9 NA 296 8.0 NA 1001.0 NA
## 411 2022-08-20 25.8 25.2 26.4 123.0 NA 220 19.8 NA 996.8 NA
## 412 2022-08-21 27.1 25.1 31.1 20.0 NA 172 13.8 NA 999.7 NA
## 413 2022-08-22 28.4 25.1 33.1 0.4 NA 180 9.2 NA 1001.2 NA
## 414 2022-08-23 28.7 24.9 33.3 3.6 NA 318 6.2 NA 1003.7 NA
## 415 2022-08-24 26.7 25.3 30.0 17.9 NA 248 4.2 NA 1005.3 NA
## 416 2022-08-25 28.4 24.9 32.5 3.2 NA 210 6.2 NA 1003.9 NA
## 417 2022-08-26 29.4 25.3 33.8 0.0 NA 219 6.1 NA 1003.2 NA
## 418 2022-08-27 28.7 26.1 33.7 2.7 NA 193 7.7 NA 1003.3 NA
## 419 2022-08-28 28.3 25.8 32.0 1.8 NA 177 6.1 NA 1004.7 NA
## 420 2022-08-29 28.9 25.7 33.2 1.9 NA 166 5.1 NA 1007.4 NA
## 421 2022-08-30 29.8 26.4 34.3 0.0 NA 174 7.6 NA 1007.9 NA
## 422 2022-08-31 29.0 26.6 33.5 2.0 NA 187 8.6 NA 1006.8 NA
## 423 2022-09-01 29.1 25.7 33.2 11.5 NA 205 6.7 NA 1007.2 NA
## 424 2022-09-02 29.4 26.4 33.7 1.5 NA 189 7.0 NA 1007.5 NA
## 425 2022-09-03 28.7 26.6 32.6 8.0 NA 203 8.0 NA 1005.8 NA
## 426 2022-09-04 28.2 25.9 31.8 17.7 NA 211 6.8 NA 1004.8 NA
# Scan for strings that commonly encode missingness in raw files; zero hits in
# every column, so there are no disguised missing values beyond the true NAs
miss_scan_count(data = Weather_Rourkela, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 11 × 2
## Variable n
## <chr> <int>
## 1 time 0
## 2 tavg 0
## 3 tmin 0
## 4 tmax 0
## 5 prcp 0
## 6 snow 0
## 7 wdir 0
## 8 wspd 0
## 9 wpgt 0
## 10 pres 0
## 11 tsun 0
## Create the shadow matrix: one factor column per variable, !NA = observed,
## NA = missing
as_shadow(Weather_Rourkela)
## # A tibble: 426 × 11
## time_NA tavg_NA tmin_NA tmax_NA prcp_NA snow_NA wdir_NA wspd_NA wpgt_NA
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA NA NA !NA !NA NA
## 2 !NA !NA !NA !NA !NA NA !NA !NA NA
## 3 !NA !NA !NA !NA !NA NA !NA !NA NA
## 4 !NA !NA !NA !NA !NA NA !NA !NA NA
## 5 !NA !NA !NA !NA !NA NA !NA !NA NA
## 6 !NA !NA !NA !NA !NA NA !NA !NA NA
## 7 !NA !NA !NA !NA !NA NA !NA !NA NA
## 8 !NA !NA !NA !NA !NA NA !NA !NA NA
## 9 !NA !NA !NA !NA !NA NA !NA !NA NA
## 10 !NA !NA !NA !NA !NA NA !NA !NA NA
## # ℹ 416 more rows
## # ℹ 2 more variables: pres_NA <fct>, tsun_NA <fct>
#Create nabular data by binding the shadow to the data
# only_miss = TRUE appends _NA shadow columns only for variables that
# actually contain missing values.
bind_shadow(Weather_Rourkela, only_miss = TRUE)
## # A tibble: 426 × 21
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun tavg_NA
## <chr> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <dbl> <lgl> <dbl> <lgl> <fct>
## 1 2021-07-… 29.3 26.2 32.6 NA NA 197 6.8 NA 1002. NA !NA
## 2 2021-07-… 29.7 27.3 33.4 11.1 NA 199 6.9 NA 1002. NA !NA
## 3 2021-07-… 27.4 25.8 29.7 66.9 NA 186 6.3 NA 1002. NA !NA
## 4 2021-07-… 28.5 26.1 32.1 11.4 NA 173 3.9 NA 1001 NA !NA
## 5 2021-07-… 29 26.2 32.6 2.7 NA 121 4.6 NA 1001. NA !NA
## 6 2021-07-… 29.3 26.2 33.7 10.8 NA 70 5.8 NA 1002. NA !NA
## 7 2021-07-… 28.9 25.7 32.9 5.4 NA 95 7 NA 1003. NA !NA
## 8 2021-07-… 28.6 25.5 32.5 10.1 NA 101 5.5 NA 1003. NA !NA
## 9 2021-07-… 29 25.4 32.7 1.9 NA 138 6.5 NA 1003. NA !NA
## 10 2021-07-… 29.5 25.5 33.4 1.3 NA 152 8.7 NA 1004 NA !NA
## # ℹ 416 more rows
## # ℹ 9 more variables: tmin_NA <fct>, tmax_NA <fct>, prcp_NA <fct>,
## # snow_NA <fct>, wdir_NA <fct>, wspd_NA <fct>, wpgt_NA <fct>, pres_NA <fct>,
## # tsun_NA <fct>
# Explore the relationship between prcp missingness and tavg by grouping on
# the shadow column prcp_NA.
# NOTE(review): mean()/sd() are called without na.rm = TRUE, so NAs in tavg
# propagate into the group summaries (visible in the output below).
Weather_Rourkela %>%
bind_shadow(only_miss = TRUE) %>%
group_by(prcp_NA) %>%
summarise(tavg_mean = mean(tavg),tavg_sd = sd(tavg))
## # A tibble: 2 × 3
## prcp_NA tavg_mean tavg_sd
## <fct> <dbl> <dbl>
## 1 !NA 26.7 4.61
## 2 NA NA NA
# Without na.rm, the NA group's mean and SD collapse to NA as well.
# Density of tavg split by prcp missingness, faceted by tmin missingness.
bind_shadow(Weather_Rourkela) %>%
ggplot(aes(x = tavg,
color = prcp_NA)) +
geom_density() +
facet_wrap(~tmin_NA)
## Warning: Removed 2 rows containing non-finite values (`stat_density()`).
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
# Explore the missingness in precipitation vs average temperature, displayed
# with `geom_miss_point()`, overall and faceted by year.
ggplot(Weather_Rourkela, aes(x = tavg,y = prcp)) + geom_miss_point()
# `time` holds "YYYY-MM-DD" strings, so parse with ymd(). The original call
# used dmy(), which could not parse them ("All formats failed to parse"
# warnings) and left the year facet variable entirely NA.
ggplot(Weather_Rourkela, aes(x = tavg,y = prcp)) + geom_miss_point() +
facet_wrap(~year(ymd(time)))
# Looks like there are not too much of missing data
##Analysing AQI stations: stations.csv
## Have a look at the data
# Basic shape and structure checks on the AQ monitoring-station metadata.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_stations)
## [1] 230 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_stations)
## Rows: 230
## Columns: 5
## $ StationId <chr> "AP001", "AP002", "AP003", "AP004", "AP005", "AS001", "BR0…
## $ StationName <chr> "Secretariat, Amaravati - APPCB", "Anand Kala Kshetram, Ra…
## $ City <chr> "Amaravati", "Rajamahendravaram", "Tirupati", "Vijayawada"…
## $ State <chr> "Andhra Pradesh", "Andhra Pradesh", "Andhra Pradesh", "And…
## $ Status <chr> "Active", "", "", "", "Active", "Active", "", "", "", "", …
# Column names and internal structure of the station metadata.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_stations)
## [1] "StationId" "StationName" "City" "State" "Status"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_stations)
## 'data.frame': 230 obs. of 5 variables:
## $ StationId : chr "AP001" "AP002" "AP003" "AP004" ...
## $ StationName: chr "Secretariat, Amaravati - APPCB" "Anand Kala Kshetram, Rajamahendravaram - APPCB" "Tirumala, Tirupati - APPCB" "PWD Grounds, Vijayawada - APPCB" ...
## $ City : chr "Amaravati" "Rajamahendravaram" "Tirupati" "Vijayawada" ...
## $ State : chr "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
## $ Status : chr "Active" "" "" "" ...
# All columns are character, so summary() only reports lengths/classes here.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_stations)
## StationId StationName City State
## Length:230 Length:230 Length:230 Length:230
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Status
## Length:230
## Class :character
## Mode :character
attach(AQ_stations)
# Recode empty strings to NA so missingness tooling can detect them.
AQ_stations [AQ_stations == ""] <- NA
## There is no records with NA but there are records with missing ("") data.
## Lets fill them with NA and then find it.
# Use the data frame column explicitly: the attach()ed `Status` is a snapshot
# taken before the "" -> NA recode, so filtering on the bare `Status` found
# 0 rows even though blank statuses exist (see the glimpse output above).
AQ_stations[is.na(AQ_stations$Status),]
##Analysing and Performing Imputations on AQI Station Hour wise - station_hour.csv
## Have a look at the data
# Hourly pollutant readings per station: ~2.6M rows, 16 columns.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_station_hour)
## [1] 2589083 16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_station_hour)
## Rows: 2,589,083
## Columns: 16
## $ StationId <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP00…
## $ Datetime <chr> "2017-11-24 17:00:00", "2017-11-24 18:00:00", "2017-11-24 1…
## $ PM2.5 <dbl> 60.50, 65.50, 80.00, 81.50, 75.25, 69.25, 67.50, 68.00, 73.…
## $ PM10 <dbl> 98.00, 111.25, 132.00, 133.25, 116.00, 108.25, 111.50, 111.…
## $ NO <dbl> 2.35, 2.70, 2.10, 1.95, 1.43, 0.70, 1.05, 1.25, 0.30, 0.80,…
## $ NO2 <dbl> 30.80, 24.20, 25.18, 16.25, 17.48, 18.47, 12.15, 14.12, 14.…
## $ NOx <dbl> 18.25, 15.07, 15.15, 10.23, 10.43, 10.38, 7.30, 8.50, 7.90,…
## $ NH3 <dbl> 8.50, 9.77, 12.02, 11.58, 12.03, 13.80, 17.65, 20.28, 11.50…
## $ CO <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.1, 0.1,…
## $ SO2 <dbl> 11.85, 13.17, 12.08, 10.47, 9.12, 9.25, 9.40, 8.90, 11.80, …
## $ O3 <dbl> 126.40, 117.12, 98.98, 112.20, 106.35, 91.10, 112.70, 116.1…
## $ Benzene <dbl> 0.10, 0.10, 0.20, 0.20, 0.20, 0.20, 0.20, 0.20, 0.20, 0.23,…
## $ Toluene <dbl> 6.10, 6.25, 5.98, 6.72, 5.75, 5.02, 5.60, 5.55, 6.60, 6.77,…
## $ Xylene <dbl> 0.10, 0.15, 0.18, 0.10, 0.08, 0.00, 0.10, 0.05, 0.00, 0.10,…
## $ AQI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
# Column names and structure of the hourly readings.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_station_hour)
## [1] "StationId" "Datetime" "PM2.5" "PM10" "NO"
## [6] "NO2" "NOx" "NH3" "CO" "SO2"
## [11] "O3" "Benzene" "Toluene" "Xylene" "AQI"
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_station_hour)
## 'data.frame': 2589083 obs. of 16 variables:
## $ StationId : chr "AP001" "AP001" "AP001" "AP001" ...
## $ Datetime : chr "2017-11-24 17:00:00" "2017-11-24 18:00:00" "2017-11-24 19:00:00" "2017-11-24 20:00:00" ...
## $ PM2.5 : num 60.5 65.5 80 81.5 75.2 ...
## $ PM10 : num 98 111 132 133 116 ...
## $ NO : num 2.35 2.7 2.1 1.95 1.43 0.7 1.05 1.25 0.3 0.8 ...
## $ NO2 : num 30.8 24.2 25.2 16.2 17.5 ...
## $ NOx : num 18.2 15.1 15.2 10.2 10.4 ...
## $ NH3 : num 8.5 9.77 12.02 11.58 12.03 ...
## $ CO : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.3 0.1 ...
## $ SO2 : num 11.85 13.17 12.08 10.47 9.12 ...
## $ O3 : num 126 117 99 112 106 ...
## $ Benzene : num 0.1 0.1 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.23 ...
## $ Toluene : num 6.1 6.25 5.98 6.72 5.75 5.02 5.6 5.55 6.6 6.77 ...
## $ Xylene : num 0.1 0.15 0.18 0.1 0.08 0 0.1 0.05 0 0.1 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
# Numeric summaries; the NA's rows below are the key missingness signal.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_station_hour)
## StationId Datetime PM2.5 PM10
## Length:2589083 Length:2589083 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 28.2 1st Qu.: 64.0
## Mode :character Mode :character Median : 52.6 Median : 116.2
## Mean : 80.9 Mean : 158.5
## 3rd Qu.: 97.7 3rd Qu.: 204.0
## Max. :1000.0 Max. :1000.0
## NA's :647689 NA's :1119252
## NO NO2 NOx NH3
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 3.0 1st Qu.: 13.1 1st Qu.: 11.3 1st Qu.: 11.2
## Median : 7.2 Median : 24.8 Median : 22.9 Median : 22.4
## Mean : 22.8 Mean : 35.2 Mean : 40.6 Mean : 28.7
## 3rd Qu.: 18.6 3rd Qu.: 45.5 3rd Qu.: 45.7 3rd Qu.: 37.8
## Max. :500.0 Max. :500.0 Max. :500.0 Max. :500.0
## NA's :553711 NA's :528973 NA's :490808 NA's :1236618
## CO SO2 O3 Benzene
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.4 1st Qu.: 4.2 1st Qu.: 11.0 1st Qu.: 0.1
## Median : 0.8 Median : 8.2 Median : 24.8 Median : 1.0
## Mean : 1.5 Mean : 12.1 Mean : 38.1 Mean : 3.3
## 3rd Qu.: 1.4 3rd Qu.: 14.5 3rd Qu.: 49.5 3rd Qu.: 3.2
## Max. :498.6 Max. :200.0 Max. :997.0 Max. :498.1
## NA's :499302 NA's :742737 NA's :725973 NA's :861579
## Toluene Xylene AQI AQI_Bucket
## Min. : 0.0 Min. : 0.0 Min. : 5.0 Length:2589083
## 1st Qu.: 0.3 1st Qu.: 0.0 1st Qu.: 84.0 Class :character
## Median : 3.4 Median : 0.2 Median : 131.0 Mode :character
## Mean : 14.9 Mean : 2.4 Mean : 180.2
## 3rd Qu.: 15.1 3rd Qu.: 1.8 3rd Qu.: 259.0
## Max. :500.0 Max. :500.0 Max. :3133.0
## NA's :1042366 NA's :2075104 NA's :570190
# NOTE(review): attach() is discouraged; prefer explicit df$col access.
attach(AQ_station_hour)
## The following object is masked from AQ_stations:
##
## StationId
# Recode empty strings to NA across the whole data frame.
AQ_station_hour [AQ_station_hour == ""] <- NA
## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5:647689 PM10:1119252 NO:553711 NO2:528973 NOx:490808 NH3:1236618
## CO:499302 SO2:742737 O3:725973 Benzene:861579 Toluene:1042366 Xylene:2075104
AQ_station_hour %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 152113
## 2 Moderate 675008
## 3 Poor 239990
## 4 Satisfactory 530164
## 5 Severe 120468
## 6 Very Poor 301150
## 7 <NA> 570190
## Looks like Moderate entries are the highest ones but second highest is NA entries...
## Lets analyse the missing data of the dataset
# Count all missing cells, then rank variables by missingness.
n_miss(AQ_station_hour) ## Total number of missing parameters
## [1] 11664492
miss_var_summary(AQ_station_hour) ## Missingness summary
## # A tibble: 16 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Xylene 2075104 80.1
## 2 NH3 1236618 47.8
## 3 PM10 1119252 43.2
## 4 Toluene 1042366 40.3
## 5 Benzene 861579 33.3
## 6 SO2 742737 28.7
## 7 O3 725973 28.0
## 8 PM2.5 647689 25.0
## 9 AQI 570190 22.0
## 10 AQI_Bucket 570190 22.0
## 11 NO 553711 21.4
## 12 NO2 528973 20.4
## 13 CO 499302 19.3
## 14 NOx 490808 19.0
## 15 StationId 0 0
## 16 Datetime 0 0
# Window the AQI column into consecutive 250-row spans and count missings per span.
miss_var_span(AQ_station_hour, var = AQI, span_every = 250) ## Missingness spread
## # A tibble: 10,357 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 43 207 0.172 0.828 250
## 2 2 0 250 0 1 250
## 3 3 0 250 0 1 250
## 4 4 0 250 0 1 250
## 5 5 0 250 0 1 250
## 6 6 0 250 0 1 250
## 7 7 0 250 0 1 250
## 8 8 0 250 0 1 250
## 9 9 0 250 0 1 250
## 10 10 8 242 0.032 0.968 250
## # ℹ 10,347 more rows
# Tabulate how many variables share each distinct missing-value count.
miss_var_table(AQ_station_hour)
## # A tibble: 14 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 2 12.5
## 2 490808 1 6.25
## 3 499302 1 6.25
## 4 528973 1 6.25
## 5 553711 1 6.25
## 6 570190 2 12.5
## 7 647689 1 6.25
## 8 725973 1 6.25
## 9 742737 1 6.25
## 10 861579 1 6.25
## 11 1042366 1 6.25
## 12 1119252 1 6.25
## 13 1236618 1 6.25
## 14 2075104 1 6.25
## vis_miss(AQ_station_hour) Unable to visualise % of missing due to large data size
gg_miss_upset(AQ_station_hour) ## plot for missing data
gg_miss_fct(x = AQ_station_hour, fct = AQI) ## Heat map of missingness
## Warning: Removed 15 rows containing missing values (`geom_tile()`).
gg_miss_span(AQ_station_hour, var = AQI, span_every = 250) ## Visualize span of AQI missingness
## AQI is missing for about 22% of rows, arriving in bursts across the spans.
## (The earlier prcp/tmin remark was copied from the weather section and does not apply here.)
miss_scan_count(data = AQ_station_hour, search = list("N/A", "NA", "na", " ","missing")) ## " " matches inside Datetime and "Very Poor"; no true sentinel strings
## # A tibble: 16 × 2
## Variable n
## <chr> <int>
## 1 StationId 0
## 2 Datetime 2589083
## 3 PM2.5 0
## 4 PM10 0
## 5 NO 0
## 6 NO2 0
## 7 NOx 0
## 8 NH3 0
## 9 CO 0
## 10 SO2 0
## 11 O3 0
## 12 Benzene 0
## 13 Toluene 0
## 14 Xylene 0
## 15 AQI 0
## 16 AQI_Bucket 301150
##Create shadow matrix data
# Cell-by-cell !NA / NA factor mirror of the hourly data.
head(as_shadow(AQ_station_hour))
## # A tibble: 6 × 16
## StationId_NA Datetime_NA PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA !NA !NA !NA
## # ℹ 7 more variables: SO2_NA <fct>, O3_NA <fct>, Benzene_NA <fct>,
## # Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>
#Create nabular data by binding the shadow to the data
# only_miss = TRUE appends _NA shadow columns only for variables with missings.
head(bind_shadow(AQ_station_hour, only_miss = TRUE))
## # A tibble: 6 × 30
## StationId Datetime PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AP001 2017-11-24 17… 60.5 98 2.35 30.8 18.2 8.5 0.1 11.8 126.
## 2 AP001 2017-11-24 18… 65.5 111. 2.7 24.2 15.1 9.77 0.1 13.2 117.
## 3 AP001 2017-11-24 19… 80 132 2.1 25.2 15.2 12.0 0.1 12.1 99.0
## 4 AP001 2017-11-24 20… 81.5 133. 1.95 16.2 10.2 11.6 0.1 10.5 112.
## 5 AP001 2017-11-24 21… 75.2 116 1.43 17.5 10.4 12.0 0.1 9.12 106.
## 6 AP001 2017-11-24 22… 69.2 108. 0.7 18.5 10.4 13.8 0.1 9.25 91.1
## # ℹ 19 more variables: Benzene <dbl>, Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## # AQI_Bucket <chr>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>,
## # NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>,
## # Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>,
## # AQI_Bucket_NA <fct>
# Explore how CO behaves when AQI is missing vs present.
# NOTE(review): mean()/sd() lack na.rm = TRUE, so CO's own NAs drive both
# group summaries to NA (see output below).
AQ_station_hour %>%
bind_shadow(only_miss = TRUE) %>%
group_by(AQI_NA) %>%
summarise(tCO_mean = mean(CO),CO_sd = sd(CO))
## # A tibble: 2 × 3
## AQI_NA tCO_mean CO_sd
## <fct> <dbl> <dbl>
## 1 !NA NA NA
## 2 NA NA NA
# Density of CO split by AQI missingness, faceted by O3 missingness.
bind_shadow(AQ_station_hour) %>%
ggplot(aes(x = CO,
color = AQI_NA)) +
geom_density() +
facet_wrap(~O3_NA)
## Warning: Removed 499302 rows containing non-finite values (`stat_density()`).
# Explore the missingness in CO and AQI, displayed using `geom_miss_point()`
ggplot(AQ_station_hour, aes(x = CO,y = AQI)) + geom_miss_point()
# Looks like there are not too much of missing data
# We would like to impute all the missing data with value below the range by 10%
AQ_station_hour_imp <- impute_below_all(AQ_station_hour)
ggplot(AQ_station_hour_imp, aes(x = CO, y = AQI)) + geom_miss_point()
# But we need to track the imputed values as well
# Bind the shadow first so the _NA columns still flag which values were imputed.
AQ_station_hour_imp_track <- bind_shadow(AQ_station_hour) %>% impute_below_all()
ggplot(AQ_station_hour_imp_track, aes(x = AQI, fill = AQI_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(AQ_station_hour_imp_track, aes(x = O3, fill = O3_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(AQ_station_hour_imp_track, aes(x = CO, y = AQI, color = AQI_NA)) + geom_point()
## So we can successfully imputed all the NA values here
# Now lets impute the critically missing parameters AQI and O3 via a linear
# regression model (impute_lm) using other pollutants as predictors.
AQ_station_hour_imp_lm_temp <- AQ_station_hour %>% bind_shadow() %>% impute_lm(AQI ~ CO + O3) %>% impute_lm(O3 ~ CO) %>% add_label_shadow()
ggplot(AQ_station_hour_imp_lm_temp, aes(x = CO, y = AQI, color = any_missing)) + geom_miss_point()
##Analysing and Performing Imputations on AQ_station_day - station_day.csv
## Have a look at the data
# Daily pollutant readings per station: ~108K rows, 16 columns.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_station_day)
## [1] 108035 16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_station_day)
## Rows: 108,035
## Columns: 16
## $ StationId <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP00…
## $ Date <chr> "2017-11-24", "2017-11-25", "2017-11-26", "2017-11-27", "20…
## $ PM2.5 <dbl> 71.36, 81.40, 78.32, 88.76, 64.18, 72.47, 69.80, 73.96, 89.…
## $ PM10 <dbl> 115.75, 124.50, 129.06, 135.32, 104.09, 114.84, 114.86, 113…
## $ NO <dbl> 1.75, 1.44, 1.26, 6.60, 2.56, 5.23, 4.69, 4.58, 7.71, 0.97,…
## $ NO2 <dbl> 20.65, 20.50, 26.00, 30.85, 28.07, 23.20, 20.17, 19.29, 26.…
## $ NOx <dbl> 12.40, 12.08, 14.85, 21.77, 17.01, 16.59, 14.54, 13.97, 19.…
## $ NH3 <dbl> 12.19, 10.72, 10.28, 12.91, 11.42, 12.25, 10.95, 10.95, 13.…
## $ CO <dbl> 0.10, 0.12, 0.14, 0.11, 0.09, 0.16, 0.12, 0.10, 0.10, 0.15,…
## $ SO2 <dbl> 10.76, 15.24, 26.96, 33.59, 19.00, 10.55, 14.07, 13.90, 19.…
## $ O3 <dbl> 109.26, 127.09, 117.44, 111.81, 138.18, 109.74, 118.09, 123…
## $ Benzene <dbl> 0.17, 0.20, 0.22, 0.29, 0.17, 0.21, 0.16, 0.17, 0.25, 0.23,…
## $ Toluene <dbl> 5.92, 6.50, 7.95, 7.63, 5.02, 4.71, 3.52, 2.85, 2.79, 3.82,…
## $ Xylene <dbl> 0.10, 0.06, 0.08, 0.12, 0.07, 0.08, 0.06, 0.04, 0.07, 0.04,…
## $ AQI <dbl> NA, 184, 197, 198, 188, 173, 165, 191, 191, 227, 168, 198, …
## $ AQI_Bucket <chr> "", "Moderate", "Moderate", "Moderate", "Moderate", "Modera…
# Column names and structure of the daily readings.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_station_day)
## [1] "StationId" "Date" "PM2.5" "PM10" "NO"
## [6] "NO2" "NOx" "NH3" "CO" "SO2"
## [11] "O3" "Benzene" "Toluene" "Xylene" "AQI"
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_station_day)
## 'data.frame': 108035 obs. of 16 variables:
## $ StationId : chr "AP001" "AP001" "AP001" "AP001" ...
## $ Date : chr "2017-11-24" "2017-11-25" "2017-11-26" "2017-11-27" ...
## $ PM2.5 : num 71.4 81.4 78.3 88.8 64.2 ...
## $ PM10 : num 116 124 129 135 104 ...
## $ NO : num 1.75 1.44 1.26 6.6 2.56 5.23 4.69 4.58 7.71 0.97 ...
## $ NO2 : num 20.6 20.5 26 30.9 28.1 ...
## $ NOx : num 12.4 12.1 14.8 21.8 17 ...
## $ NH3 : num 12.2 10.7 10.3 12.9 11.4 ...
## $ CO : num 0.1 0.12 0.14 0.11 0.09 0.16 0.12 0.1 0.1 0.15 ...
## $ SO2 : num 10.8 15.2 27 33.6 19 ...
## $ O3 : num 109 127 117 112 138 ...
## $ Benzene : num 0.17 0.2 0.22 0.29 0.17 0.21 0.16 0.17 0.25 0.23 ...
## $ Toluene : num 5.92 6.5 7.95 7.63 5.02 4.71 3.52 2.85 2.79 3.82 ...
## $ Xylene : num 0.1 0.06 0.08 0.12 0.07 0.08 0.06 0.04 0.07 0.04 ...
## $ AQI : num NA 184 197 198 188 173 165 191 191 227 ...
## $ AQI_Bucket: chr "" "Moderate" "Moderate" "Moderate" ...
# Numeric summaries; the NA's rows below are the key missingness signal.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_station_day)
## StationId Date PM2.5 PM10
## Length:108035 Length:108035 Min. : 0.02 Min. : 0.01
## Class :character Class :character 1st Qu.: 31.88 1st Qu.: 70.15
## Mode :character Mode :character Median : 55.95 Median : 122.09
## Mean : 80.27 Mean : 157.97
## 3rd Qu.: 99.92 3rd Qu.: 208.67
## Max. :1000.00 Max. :1000.00
## NA's :21625 NA's :42706
## NO NO2 NOx NH3
## Min. : 0.01 Min. : 0.01 Min. : 0.00 Min. : 0.01
## 1st Qu.: 4.84 1st Qu.: 15.09 1st Qu.: 13.97 1st Qu.: 11.90
## Median : 10.29 Median : 27.21 Median : 26.66 Median : 23.59
## Mean : 23.12 Mean : 35.24 Mean : 41.20 Mean : 28.73
## 3rd Qu.: 24.98 3rd Qu.: 46.93 3rd Qu.: 50.50 3rd Qu.: 38.14
## Max. :470.00 Max. :448.05 Max. :467.63 Max. :418.90
## NA's :17106 NA's :16547 NA's :15500 NA's :48105
## CO SO2 O3 Benzene
## Min. : 0.000 Min. : 0.01 Min. : 0.01 Min. : 0.000
## 1st Qu.: 0.530 1st Qu.: 5.04 1st Qu.: 18.89 1st Qu.: 0.160
## Median : 0.910 Median : 8.95 Median : 30.84 Median : 1.210
## Mean : 1.606 Mean : 12.26 Mean : 38.13 Mean : 3.358
## 3rd Qu.: 1.450 3rd Qu.: 14.92 3rd Qu.: 47.14 3rd Qu.: 3.610
## Max. :175.810 Max. :195.65 Max. :963.00 Max. :455.030
## NA's :12998 NA's :25204 NA's :25568 NA's :31455
## Toluene Xylene AQI AQI_Bucket
## Min. : 0.00 Min. : 0.00 Min. : 8.0 Length:108035
## 1st Qu.: 0.69 1st Qu.: 0.00 1st Qu.: 86.0 Class :character
## Median : 4.33 Median : 0.40 Median : 132.0 Mode :character
## Mean : 15.35 Mean : 2.42 Mean : 179.7
## 3rd Qu.: 17.51 3rd Qu.: 2.11 3rd Qu.: 254.0
## Max. :454.85 Max. :170.37 Max. :2049.0
## NA's :38702 NA's :85137 NA's :21010
# NOTE(review): attach() is discouraged; prefer explicit df$col access.
attach(AQ_station_day)
## The following objects are masked from AQ_station_hour:
##
## AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
## SO2, StationId, Toluene, Xylene
## The following object is masked from AQ_stations:
##
## StationId
# Recode empty strings to NA across the whole data frame.
AQ_station_day [AQ_station_day == ""] <- NA
## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 21625 PM10: 42706 NO: 17106 NO2: 16547 NOx: 15500 NH3: 48105
## CO: 12998 SO2: 25204 O3: 25568 Benzene: 31455 Toluene: 38702 Xylene: 85137
AQ_station_day %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 5510
## 2 Moderate 29417
## 3 Poor 11493
## 4 Satisfactory 23636
## 5 Severe 5207
## 6 Very Poor 11762
## 7 <NA> 21010
## Looks like Moderate entries are the highest ones, followed by Satisfactory
## but third highest is NA entries...
# NOTE(review): this is an accidental duplicate of the count above — it
# recomputes the identical table and could be removed.
AQ_station_day %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 5510
## 2 Moderate 29417
## 3 Poor 11493
## 4 Satisfactory 23636
## 5 Severe 5207
## 6 Very Poor 11762
## 7 <NA> 21010
## Looks like Moderate entries are the highest ones but second highest is NA entries...
## Lets analyse the missing data of the dataset
# Count all missing cells, then rank variables by missingness.
n_miss(AQ_station_day) ## Total number of missing parameters
## [1] 422673
miss_var_summary(AQ_station_day) ## Missingness summary
## # A tibble: 16 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Xylene 85137 78.8
## 2 NH3 48105 44.5
## 3 PM10 42706 39.5
## 4 Toluene 38702 35.8
## 5 Benzene 31455 29.1
## 6 O3 25568 23.7
## 7 SO2 25204 23.3
## 8 PM2.5 21625 20.0
## 9 AQI 21010 19.4
## 10 AQI_Bucket 21010 19.4
## 11 NO 17106 15.8
## 12 NO2 16547 15.3
## 13 NOx 15500 14.3
## 14 CO 12998 12.0
## 15 StationId 0 0
## 16 Date 0 0
# Window the AQI column into consecutive 250-row spans and count missings per span.
miss_var_span(AQ_station_day, var = AQI, span_every = 250) ## Missingness spread
## # A tibble: 433 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 22 228 0.088 0.912 250
## 2 2 32 218 0.128 0.872 250
## 3 3 56 194 0.224 0.776 250
## 4 4 9 241 0.036 0.964 250
## 5 5 26 224 0.104 0.896 250
## 6 6 188 62 0.752 0.248 250
## 7 7 24 226 0.096 0.904 250
## 8 8 19 231 0.076 0.924 250
## 9 9 20 230 0.08 0.92 250
## 10 10 9 241 0.036 0.964 250
## # ℹ 423 more rows
# Tabulate how many variables share each distinct missing-value count.
miss_var_table(AQ_station_day)
## # A tibble: 14 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 2 12.5
## 2 12998 1 6.25
## 3 15500 1 6.25
## 4 16547 1 6.25
## 5 17106 1 6.25
## 6 21010 2 12.5
## 7 21625 1 6.25
## 8 25204 1 6.25
## 9 25568 1 6.25
## 10 31455 1 6.25
## 11 38702 1 6.25
## 12 42706 1 6.25
## 13 48105 1 6.25
## 14 85137 1 6.25
## vis_miss(AQ_station_day) Unable to visualise % of missing due to large data size
gg_miss_upset(AQ_station_day) ## plot for missing data
gg_miss_fct(x = AQ_station_day, fct = AQI) ## Heat map of missingness
## Warning: Removed 15 rows containing missing values (`geom_tile()`).
gg_miss_span(AQ_station_day, var = AQI, span_every = 250) ## Visualize span of AQI missingness
## AQI is missing for about 19% of rows, arriving in bursts across the spans.
## (The earlier prcp/tmin remark was copied from the weather section and does not apply here.)
miss_scan_count(data = AQ_station_day, search = list("N/A", "NA", "na", " ","missing")) ## " " matches inside "Very Poor"; no true sentinel strings
## # A tibble: 16 × 2
## Variable n
## <chr> <int>
## 1 StationId 0
## 2 Date 0
## 3 PM2.5 0
## 4 PM10 0
## 5 NO 0
## 6 NO2 0
## 7 NOx 0
## 8 NH3 0
## 9 CO 0
## 10 SO2 0
## 11 O3 0
## 12 Benzene 0
## 13 Toluene 0
## 14 Xylene 0
## 15 AQI 0
## 16 AQI_Bucket 11762
##Create shadow matrix data
# Cell-by-cell !NA / NA factor mirror of the daily data.
head(as_shadow(AQ_station_day))
## # A tibble: 6 × 16
## StationId_NA Date_NA PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## # ℹ 6 more variables: O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## # Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>
#Create nabular data by binding the shadow to the data
# only_miss = TRUE appends _NA shadow columns only for variables with missings.
head(bind_shadow(AQ_station_day, only_miss = TRUE))
## # A tibble: 6 × 30
## StationId Date PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3 Benzene
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AP001 2017-… 71.4 116. 1.75 20.6 12.4 12.2 0.1 10.8 109. 0.17
## 2 AP001 2017-… 81.4 124. 1.44 20.5 12.1 10.7 0.12 15.2 127. 0.2
## 3 AP001 2017-… 78.3 129. 1.26 26 14.8 10.3 0.14 27.0 117. 0.22
## 4 AP001 2017-… 88.8 135. 6.6 30.8 21.8 12.9 0.11 33.6 112. 0.29
## 5 AP001 2017-… 64.2 104. 2.56 28.1 17.0 11.4 0.09 19 138. 0.17
## 6 AP001 2017-… 72.5 115. 5.23 23.2 16.6 12.2 0.16 10.6 110. 0.21
## # ℹ 18 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## # AQI_Bucket <chr>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>,
## # NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>,
## # Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>,
## # AQI_Bucket_NA <fct>
# Explore how CO behaves when AQI is missing vs present.
# NOTE(review): mean()/sd() lack na.rm = TRUE, so CO's own NAs drive both
# group summaries to NA (see output below).
AQ_station_day %>%
bind_shadow(only_miss = TRUE) %>%
group_by(AQI_NA) %>%
summarise(tCO_mean = mean(CO),CO_sd = sd(CO))
## # A tibble: 2 × 3
## AQI_NA tCO_mean CO_sd
## <fct> <dbl> <dbl>
## 1 !NA NA NA
## 2 NA NA NA
# Density of CO split by AQI missingness, faceted by O3 missingness.
bind_shadow(AQ_station_day) %>%
ggplot(aes(x = CO,
color = AQI_NA)) +
geom_density() +
facet_wrap(~O3_NA)
## Warning: Removed 12998 rows containing non-finite values (`stat_density()`).
# Explore the missingness in CO and AQI, displayed using `geom_miss_point()`
ggplot(AQ_station_day, aes(x = CO,y = AQI)) + geom_miss_point()
# Looks like there are not too much of missing data
# We would like to impute all the missing data with value below the range by 10%
AQ_station_day_imp <- impute_below_all(AQ_station_day)
ggplot(AQ_station_day_imp, aes(x = CO, y = AQI)) + geom_miss_point()
# But we need to track the imputed values as well
# Bind the shadow first so the _NA columns still flag which values were imputed.
AQ_station_day_imp_track <- bind_shadow(AQ_station_day) %>% impute_below_all()
ggplot(AQ_station_day_imp_track, aes(x = AQI, fill = AQI_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(AQ_station_day_imp_track, aes(x = O3, fill = O3_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(AQ_station_day_imp_track, aes(x = CO, y = AQI, color = AQI_NA)) + geom_point()
## So we can successfully imputed all the NA values here
# Now lets impute the critically missing parameters AQI and O3 via a linear
# regression model (impute_lm) using other pollutants as predictors.
AQ_station_day_imp_lm_temp <- AQ_station_day %>% bind_shadow() %>% impute_lm(AQI ~ CO + O3) %>% impute_lm(O3 ~ CO) %>% add_label_shadow()
ggplot(AQ_station_day_imp_lm_temp, aes(x = CO, y = AQI, color = any_missing)) + geom_miss_point()
##Analysing and Performing Imputations on AQ_city_day <- read.csv("./datasets/city_day.csv")
## Have a look at the data
# Daily pollutant readings aggregated per city: ~29.5K rows, 16 columns.
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_city_day)
## [1] 29531 16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_city_day)
## Rows: 29,531
## Columns: 16
## $ City <chr> "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmeda…
## $ Date <chr> "2015-01-01", "2015-01-02", "2015-01-03", "2015-01-04", "20…
## $ PM2.5 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ PM10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ NO <dbl> 0.92, 0.97, 17.40, 1.70, 22.10, 45.41, 112.16, 80.87, 29.16…
## $ NO2 <dbl> 18.22, 15.69, 19.30, 18.48, 21.42, 38.48, 40.62, 36.74, 31.…
## $ NOx <dbl> 17.15, 16.46, 29.70, 17.97, 37.76, 81.50, 130.77, 96.75, 48…
## $ NH3 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CO <dbl> 0.92, 0.97, 17.40, 1.70, 22.10, 45.41, 112.16, 80.87, 29.16…
## $ SO2 <dbl> 27.64, 24.55, 29.07, 18.59, 39.33, 45.76, 32.28, 38.54, 58.…
## $ O3 <dbl> 133.36, 34.06, 30.70, 36.08, 39.31, 46.51, 33.47, 31.89, 25…
## $ Benzene <dbl> 0.00, 3.68, 6.80, 4.43, 7.01, 5.42, 0.00, 0.00, 0.00, 0.00,…
## $ Toluene <dbl> 0.02, 5.50, 16.40, 10.14, 18.89, 10.83, 0.00, 0.00, 0.00, 0…
## $ Xylene <dbl> 0.00, 3.77, 2.25, 1.00, 2.78, 1.93, 0.00, 0.00, 0.00, 0.00,…
## $ AQI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
# Column names and structure of the city-level daily readings.
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_city_day)
## [1] "City" "Date" "PM2.5" "PM10" "NO"
## [6] "NO2" "NOx" "NH3" "CO" "SO2"
## [11] "O3" "Benzene" "Toluene" "Xylene" "AQI"
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_city_day)
## 'data.frame': 29531 obs. of 16 variables:
## $ City : chr "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
## $ Date : chr "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" ...
## $ PM2.5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ PM10 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NO : num 0.92 0.97 17.4 1.7 22.1 ...
## $ NO2 : num 18.2 15.7 19.3 18.5 21.4 ...
## $ NOx : num 17.1 16.5 29.7 18 37.8 ...
## $ NH3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CO : num 0.92 0.97 17.4 1.7 22.1 ...
## $ SO2 : num 27.6 24.6 29.1 18.6 39.3 ...
## $ O3 : num 133.4 34.1 30.7 36.1 39.3 ...
## $ Benzene : num 0 3.68 6.8 4.43 7.01 5.42 0 0 0 0 ...
## $ Toluene : num 0.02 5.5 16.4 10.14 18.89 ...
## $ Xylene : num 0 3.77 2.25 1 2.78 1.93 0 0 0 0 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
# Numeric summaries; the NA's rows below are the key missingness signal.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_city_day)
## City Date PM2.5 PM10
## Length:29531 Length:29531 Min. : 0.04 Min. : 0.01
## Class :character Class :character 1st Qu.: 28.82 1st Qu.: 56.26
## Mode :character Mode :character Median : 48.57 Median : 95.68
## Mean : 67.45 Mean : 118.13
## 3rd Qu.: 80.59 3rd Qu.: 149.75
## Max. :949.99 Max. :1000.00
## NA's :4598 NA's :11140
## NO NO2 NOx NH3
## Min. : 0.02 Min. : 0.01 Min. : 0.00 Min. : 0.01
## 1st Qu.: 5.63 1st Qu.: 11.75 1st Qu.: 12.82 1st Qu.: 8.58
## Median : 9.89 Median : 21.69 Median : 23.52 Median : 15.85
## Mean : 17.57 Mean : 28.56 Mean : 32.31 Mean : 23.48
## 3rd Qu.: 19.95 3rd Qu.: 37.62 3rd Qu.: 40.13 3rd Qu.: 30.02
## Max. :390.68 Max. :362.21 Max. :467.63 Max. :352.89
## NA's :3582 NA's :3585 NA's :4185 NA's :10328
## CO SO2 O3 Benzene
## Min. : 0.000 Min. : 0.01 Min. : 0.01 Min. : 0.000
## 1st Qu.: 0.510 1st Qu.: 5.67 1st Qu.: 18.86 1st Qu.: 0.120
## Median : 0.890 Median : 9.16 Median : 30.84 Median : 1.070
## Mean : 2.249 Mean : 14.53 Mean : 34.49 Mean : 3.281
## 3rd Qu.: 1.450 3rd Qu.: 15.22 3rd Qu.: 45.57 3rd Qu.: 3.080
## Max. :175.810 Max. :193.86 Max. :257.73 Max. :455.030
## NA's :2059 NA's :3854 NA's :4022 NA's :5623
## Toluene Xylene AQI AQI_Bucket
## Min. : 0.000 Min. : 0.00 Min. : 13.0 Length:29531
## 1st Qu.: 0.600 1st Qu.: 0.14 1st Qu.: 81.0 Class :character
## Median : 2.970 Median : 0.98 Median : 118.0 Mode :character
## Mean : 8.701 Mean : 3.07 Mean : 166.5
## 3rd Qu.: 9.150 3rd Qu.: 3.35 3rd Qu.: 208.0
## Max. :454.850 Max. :170.37 Max. :2049.0
## NA's :8041 NA's :18109 NA's :4681
## NOTE(review): attach() is discouraged — it places a COPY of the data frame
## on the search path, so later edits to AQ_city_day (e.g. the "" -> NA recode
## just below) are NOT reflected in the attached names. Prefer explicit
## AQ_city_day$col, with(), or dplyr verbs.
attach(AQ_city_day)
## The following objects are masked from AQ_station_day:
##
## AQI, AQI_Bucket, Benzene, CO, Date, NH3, NO, NO2, NOx, O3, PM10,
## PM2.5, SO2, Toluene, Xylene
## The following objects are masked from AQ_station_hour:
##
## AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
## SO2, Toluene, Xylene
## The following object is masked from AQ_stations:
##
## City
## Recode empty strings as proper NA so the naniar missingness tools see them.
## NOTE(review): this runs AFTER summary() and attach(), so the summary above
## and the attached copy still contain "" rather than NA.
AQ_city_day [AQ_city_day == ""] <- NA
## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 4598 PM10: 11140 NO: 3582 NO2: 3585 NOx: 4185 NH3: 10328
## CO: 2059 SO2: 3854 O3: 4022 Benzene: 5623 Toluene: 8041 Xylene: 18109
## Tabulate rows per AQI bucket; after the recode, NA is counted explicitly.
AQ_city_day %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 1341
## 2 Moderate 8829
## 3 Poor 2781
## 4 Satisfactory 8224
## 5 Severe 1338
## 6 Very Poor 2337
## 7 <NA> 4681
## Looks like Moderate entries are the highest ones, followed by Satisfactory
## but third highest is NA entries...
## (Removed: a verbatim re-run of the AQI_Bucket group_by()/count() pipeline
## appeared here — the identical statement was executed twice in a row, and the
## second interpretation contradicted the first. Per the table above, the NA
## bucket (4681) is the THIRD-highest group, after Moderate (8829) and
## Satisfactory (8224).)
## Lets analyse the missing data of the dataset
n_miss(AQ_city_day) ## Total number of missing cells across the whole frame
## [1] 88488
## Per-variable missing counts and percentages, sorted descending.
miss_var_summary(AQ_city_day) ## Missingness summary
## # A tibble: 16 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Xylene 18109 61.3
## 2 PM10 11140 37.7
## 3 NH3 10328 35.0
## 4 Toluene 8041 27.2
## 5 Benzene 5623 19.0
## 6 AQI 4681 15.9
## 7 AQI_Bucket 4681 15.9
## 8 PM2.5 4598 15.6
## 9 NOx 4185 14.2
## 10 O3 4022 13.6
## 11 SO2 3854 13.1
## 12 NO2 3585 12.1
## 13 NO 3582 12.1
## 14 CO 2059 6.97
## 15 City 0 0
## 16 Date 0 0
miss_var_span(AQ_city_day, var = AQI, span_every = 250) ## Missingness spread
## # A tibble: 119 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 41 209 0.164 0.836 250
## 2 2 196 54 0.784 0.216 250
## 3 3 133 117 0.532 0.468 250
## 4 4 250 0 1 0 250
## 5 5 32 218 0.128 0.872 250
## 6 6 3 247 0.012 0.988 250
## 7 7 13 237 0.052 0.948 250
## 8 8 7 243 0.028 0.972 250
## 9 9 4 246 0.016 0.984 250
## 10 10 49 201 0.196 0.804 250
## # ℹ 109 more rows
miss_var_table(AQ_city_day)
## # A tibble: 14 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 2 12.5
## 2 2059 1 6.25
## 3 3582 1 6.25
## 4 3585 1 6.25
## 5 3854 1 6.25
## 6 4022 1 6.25
## 7 4185 1 6.25
## 8 4598 1 6.25
## 9 4681 2 12.5
## 10 5623 1 6.25
## 11 8041 1 6.25
## 12 10328 1 6.25
## 13 11140 1 6.25
## 14 18109 1 6.25
## vis_miss(AQ_city_day) Unable to visualise % of missing due to large data size
gg_miss_upset(AQ_city_day) ## plot for missing data
gg_miss_fct(x = AQ_city_day, fct = AQI) ## Heat map of missingness
## Warning: Removed 15 rows containing missing values (`geom_tile()`).
gg_miss_span(AQ_city_day, var = AQI, span_every = 250) ## Visualize span of AQI missingness
## AQI missingness is clearly clustered: some 250-row spans are entirely
## missing (e.g. span 4) while others are almost complete (e.g. span 6).
miss_scan_count(data = AQ_city_day, search = list("N/A", "NA", "na", " ","missing")) ## Scan for placeholder strings hiding as text
## # A tibble: 16 × 2
## Variable n
## <chr> <int>
## 1 City 7541
## 2 Date 0
## 3 PM2.5 0
## 4 PM10 0
## 5 NO 0
## 6 NO2 0
## 7 NOx 0
## 8 NH3 0
## 9 CO 0
## 10 SO2 0
## 11 O3 0
## 12 Benzene 0
## 13 Toluene 0
## 14 Xylene 0
## 15 AQI 0
## 16 AQI_Bucket 2337
##Create shadow matrix data
## as_shadow() mirrors every column as a factor flagging NA vs !NA per cell.
head(as_shadow(AQ_city_day))
## # A tibble: 6 × 16
## City_NA Date_NA PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA O3_NA
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA NA NA !NA !NA !NA NA !NA !NA !NA
## 2 !NA !NA NA NA !NA !NA !NA NA !NA !NA !NA
## 3 !NA !NA NA NA !NA !NA !NA NA !NA !NA !NA
## 4 !NA !NA NA NA !NA !NA !NA NA !NA !NA !NA
## 5 !NA !NA NA NA !NA !NA !NA NA !NA !NA !NA
## 6 !NA !NA NA NA !NA !NA !NA NA !NA !NA !NA
## # ℹ 5 more variables: Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>,
## # AQI_NA <fct>, AQI_Bucket_NA <fct>
#Create nabular data by binding the shadow to the data
# only_miss = TRUE binds shadow columns only for variables that have missings
# (City and Date are complete, hence 30 columns rather than 32).
head(bind_shadow(AQ_city_day, only_miss = TRUE))
## # A tibble: 6 × 30
## City Date PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3 Benzene
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ahmedabad 2015-… NA NA 0.92 18.2 17.2 NA 0.92 27.6 133. 0
## 2 Ahmedabad 2015-… NA NA 0.97 15.7 16.5 NA 0.97 24.6 34.1 3.68
## 3 Ahmedabad 2015-… NA NA 17.4 19.3 29.7 NA 17.4 29.1 30.7 6.8
## 4 Ahmedabad 2015-… NA NA 1.7 18.5 18.0 NA 1.7 18.6 36.1 4.43
## 5 Ahmedabad 2015-… NA NA 22.1 21.4 37.8 NA 22.1 39.3 39.3 7.01
## 6 Ahmedabad 2015-… NA NA 45.4 38.5 81.5 NA 45.4 45.8 46.5 5.42
## # ℹ 18 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## # AQI_Bucket <chr>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>,
## # NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>,
## # Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>,
## # AQI_Bucket_NA <fct>
# Lets explore the relationship between AQI missingness and the CO statistics.
# BUG FIX: mean()/sd() need na.rm = TRUE — CO itself has 2059 NAs (see the
# missingness summary above), so without it both statistics collapse to NA for
# every AQI_NA group, which is exactly what the first run produced.
# Also renamed the accidental "tCO_mean" column to "CO_mean".
AQ_city_day %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(AQI_NA) %>%
  summarise(CO_mean = mean(CO, na.rm = TRUE), CO_sd = sd(CO, na.rm = TRUE))
## Density of CO split by AQI missingness, faceted by O3 missingness.
bind_shadow(AQ_city_day) %>%
ggplot(aes(x = CO,
color = AQI_NA)) +
geom_density() +
facet_wrap(~O3_NA)
## Warning: Removed 2059 rows containing non-finite values (`stat_density()`).
# Explore the missingness in CO and AQI, and display the missingness using `geom_miss_point'
ggplot(AQ_city_day, aes(x = CO,y = AQI)) + geom_miss_point()
# Looks like there are not too much of missing data
# We would like to impute all the missing data with value below the range by 10%
AQ_city_day_imp <- impute_below_all(AQ_city_day)
ggplot(AQ_city_day_imp, aes(x = CO, y = AQI)) + geom_miss_point()
# But we need to track the imputed values as well
AQ_city_day_imp_track <- bind_shadow(AQ_city_day) %>% impute_below_all()
ggplot(AQ_city_day_imp_track, aes(x = AQI, fill = AQI_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(AQ_city_day_imp_track, aes(x = O3, fill = O3_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(AQ_city_day_imp_track, aes(x = CO, y = AQI, color = AQI_NA)) + geom_point()
## So we have successfully imputed all the NA values here
# Now lets impute the critically missing parameters AQI and O3
# via linear regression (impute_lm) against other explanatory parameters
AQ_city_day_imp_lm_temp <- AQ_city_day %>% bind_shadow() %>% impute_lm(AQI ~ CO + O3) %>% impute_lm(O3 ~ CO) %>% add_label_shadow()
ggplot(AQ_city_day_imp_lm_temp, aes(x = CO, y = AQI, color = any_missing)) + geom_miss_point()
##Analysing AQ_city_hour: city_hour.csv
## Hourly city-level air-quality readings (707,875 rows x 16 columns).
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_city_hour)
## [1] 707875 16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_city_hour)
## Rows: 707,875
## Columns: 16
## $ City <chr> "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmeda…
## $ Datetime <chr> "2015-01-01 01:00:00", "2015-01-01 02:00:00", "2015-01-01 0…
## $ PM2.5 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ PM10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ NO <dbl> 1.00, 0.02, 0.08, 0.30, 0.12, 0.33, 0.45, 1.03, 1.47, 2.05,…
## $ NO2 <dbl> 40.01, 27.75, 19.32, 16.45, 14.90, 15.95, 15.94, 16.66, 16.…
## $ NOx <dbl> 36.37, 19.73, 11.08, 9.20, 7.85, 10.82, 12.47, 16.48, 18.02…
## $ NH3 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CO <dbl> 1.00, 0.02, 0.08, 0.30, 0.12, 0.33, 0.45, 1.03, 1.47, 2.05,…
## $ SO2 <dbl> 122.07, 85.90, 52.83, 39.53, 32.63, 29.87, 27.41, 20.92, 16…
## $ O3 <dbl> NA, NA, NA, 153.58, NA, 64.25, 191.96, 177.21, 122.08, NA, …
## $ Benzene <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Toluene <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ Xylene <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ AQI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_city_hour)
## [1] "City" "Datetime" "PM2.5" "PM10" "NO"
## [6] "NO2" "NOx" "NH3" "CO" "SO2"
## [11] "O3" "Benzene" "Toluene" "Xylene" "AQI"
## [16] "AQI_Bucket"
## Structure dump: Datetime is still a character column ("YYYY-MM-DD HH:MM:SS"),
## not yet parsed to POSIXct.
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_city_hour)
## 'data.frame': 707875 obs. of 16 variables:
## $ City : chr "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
## $ Datetime : chr "2015-01-01 01:00:00" "2015-01-01 02:00:00" "2015-01-01 03:00:00" "2015-01-01 04:00:00" ...
## $ PM2.5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ PM10 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NO : num 1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
## $ NO2 : num 40 27.8 19.3 16.4 14.9 ...
## $ NOx : num 36.37 19.73 11.08 9.2 7.85 ...
## $ NH3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CO : num 1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
## $ SO2 : num 122.1 85.9 52.8 39.5 32.6 ...
## $ O3 : num NA NA NA 154 NA ...
## $ Benzene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Toluene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Xylene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
## Column-wise summary; character columns (City, Datetime, AQI_Bucket) show
## only length/class, so "" placeholders are invisible until recoded to NA.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_city_hour)
## City Datetime PM2.5 PM10
## Length:707875 Length:707875 Min. : 0.01 Min. : 0.01
## Class :character Class :character 1st Qu.: 26.20 1st Qu.: 52.38
## Mode :character Mode :character Median : 46.42 Median : 91.50
## Mean : 67.62 Mean : 119.08
## 3rd Qu.: 79.49 3rd Qu.: 147.52
## Max. : 999.99 Max. :1000.00
## NA's :145088 NA's :296737
## NO NO2 NOx NH3
## Min. : 0.01 Min. : 0.01 Min. : 0.00 Min. : 0.01
## 1st Qu.: 3.84 1st Qu.: 10.81 1st Qu.: 10.66 1st Qu.: 8.12
## Median : 7.96 Median : 20.32 Median : 20.79 Median : 15.38
## Mean : 17.42 Mean : 28.89 Mean : 32.29 Mean : 23.61
## 3rd Qu.: 16.15 3rd Qu.: 36.35 3rd Qu.: 37.15 3rd Qu.: 29.23
## Max. :499.99 Max. :499.51 Max. :498.61 Max. :499.97
## NA's :116632 NA's :117122 NA's :123224 NA's :272542
## CO SO2 O3 Benzene
## Min. : 0.00 Min. : 0.01 Min. : 0.01 Min. : 0.00
## 1st Qu.: 0.42 1st Qu.: 4.88 1st Qu.: 13.42 1st Qu.: 0.05
## Median : 0.80 Median : 8.37 Median : 26.24 Median : 0.86
## Mean : 2.18 Mean : 14.04 Mean : 34.80 Mean : 3.09
## 3rd Qu.: 1.37 3rd Qu.: 14.78 3rd Qu.: 47.62 3rd Qu.: 2.75
## Max. :498.57 Max. :199.96 Max. :497.62 Max. :498.07
## NA's :86517 NA's :130373 NA's :129208 NA's :163646
## Toluene Xylene AQI AQI_Bucket
## Min. : 0.00 Min. : 0.0 Min. : 8.0 Length:707875
## 1st Qu.: 0.37 1st Qu.: 0.1 1st Qu.: 79.0 Class :character
## Median : 2.59 Median : 0.8 Median : 116.0 Mode :character
## Mean : 8.66 Mean : 3.1 Mean : 166.4
## 3rd Qu.: 8.41 3rd Qu.: 3.1 3rd Qu.: 208.0
## Max. :499.40 Max. :500.0 Max. :3133.0
## NA's :220607 NA's :455829 NA's :129080
## NOTE(review): as above, attach() snapshots AQ_city_hour; the "" -> NA
## recode below does NOT propagate to the attached copy. Prefer explicit
## column references.
attach(AQ_city_hour)
## The following objects are masked from AQ_city_day:
##
## AQI, AQI_Bucket, Benzene, CO, City, NH3, NO, NO2, NOx, O3, PM10,
## PM2.5, SO2, Toluene, Xylene
## The following objects are masked from AQ_station_day:
##
## AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
## SO2, Toluene, Xylene
## The following objects are masked from AQ_station_hour:
##
## AQI, AQI_Bucket, Benzene, CO, Datetime, NH3, NO, NO2, NOx, O3,
## PM10, PM2.5, SO2, Toluene, Xylene
## The following object is masked from AQ_stations:
##
## City
## Recode empty strings as proper NA so naniar's tools see them as missing.
AQ_city_hour [AQ_city_hour == ""] <- NA
## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 145088 PM10: 296737 NO: 116632 NO2: 117122 NOx: 123224 NH3: 272542
## CO: 86517 SO2: 130373 O3: 129208 Benzene: 163646 Toluene: 220607 Xylene: 455829
## Tabulate rows per AQI bucket; after the recode, NA is counted explicitly.
AQ_city_hour %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 38611
## 2 Moderate 198991
## 3 Poor 66654
## 4 Satisfactory 189434
## 5 Severe 27650
## 6 Very Poor 57455
## 7 <NA> 129080
## Looks like Moderate entries are the highest ones, followed by Satisfactory
## but third highest is NA entries...
## Count every missing cell across the hourly dataset.
n_miss(AQ_city_hour) ## Total number of missing cells
## [1] 2515685
## Per-variable missing counts and percentages, sorted descending.
miss_var_summary(AQ_city_hour) ## Missingness summary
## # A tibble: 16 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Xylene 455829 64.4
## 2 PM10 296737 41.9
## 3 NH3 272542 38.5
## 4 Toluene 220607 31.2
## 5 Benzene 163646 23.1
## 6 PM2.5 145088 20.5
## 7 SO2 130373 18.4
## 8 O3 129208 18.3
## 9 AQI 129080 18.2
## 10 AQI_Bucket 129080 18.2
## 11 NOx 123224 17.4
## 12 NO2 117122 16.5
## 13 NO 116632 16.5
## 14 CO 86517 12.2
## 15 City 0 0
## 16 Datetime 0 0
miss_var_span(AQ_city_hour, var = AQI, span_every = 250) ## Missingness spread
## # A tibble: 2,832 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 250 0 1 0 250
## 2 2 250 0 1 0 250
## 3 3 180 70 0.72 0.28 250
## 4 4 0 250 0 1 250
## 5 5 0 250 0 1 250
## 6 6 102 148 0.408 0.592 250
## 7 7 54 196 0.216 0.784 250
## 8 8 0 250 0 1 250
## 9 9 0 250 0 1 250
## 10 10 74 176 0.296 0.704 250
## # ℹ 2,822 more rows
miss_var_table(AQ_city_hour)
## # A tibble: 14 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 2 12.5
## 2 86517 1 6.25
## 3 116632 1 6.25
## 4 117122 1 6.25
## 5 123224 1 6.25
## 6 129080 2 12.5
## 7 129208 1 6.25
## 8 130373 1 6.25
## 9 145088 1 6.25
## 10 163646 1 6.25
## 11 220607 1 6.25
## 12 272542 1 6.25
## 13 296737 1 6.25
## 14 455829 1 6.25
## vis_miss(AQ_city_hour) Unable to visualise % of missing due to large data size
gg_miss_upset(AQ_city_hour) ## plot for missing data
gg_miss_fct(x = AQ_city_hour, fct = AQI) ## Heat map of missingness
## Warning: Removed 15 rows containing missing values (`geom_tile()`).
gg_miss_span(AQ_city_hour, var = AQI, span_every = 250) ## Visualize span of AQI missingness
## AQI missingness is clustered: some 250-row spans are entirely missing while
## others are fully complete.
## NOTE(review): in the scan below, the Datetime count (707875 = every row)
## comes from the " " (space) pattern matching the space inside every
## "YYYY-MM-DD HH:MM:SS" timestamp — it is NOT genuine missingness.
miss_scan_count(data = AQ_city_hour, search = list("N/A", "NA", "na", " ","missing")) ## Scan for placeholder strings hiding as text
## # A tibble: 16 × 2
## Variable n
## <chr> <int>
## 1 City 180770
## 2 Datetime 707875
## 3 PM2.5 0
## 4 PM10 0
## 5 NO 0
## 6 NO2 0
## 7 NOx 0
## 8 NH3 0
## 9 CO 0
## 10 SO2 0
## 11 O3 0
## 12 Benzene 0
## 13 Toluene 0
## 14 Xylene 0
## 15 AQI 0
## 16 AQI_Bucket 57455
##Create shadow matrix data
## as_shadow() mirrors every column as a factor flagging NA vs !NA per cell.
head(as_shadow(AQ_city_hour))
## # A tibble: 6 × 16
## City_NA Datetime_NA PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA NA NA !NA !NA !NA NA !NA !NA
## 2 !NA !NA NA NA !NA !NA !NA NA !NA !NA
## 3 !NA !NA NA NA !NA !NA !NA NA !NA !NA
## 4 !NA !NA NA NA !NA !NA !NA NA !NA !NA
## 5 !NA !NA NA NA !NA !NA !NA NA !NA !NA
## 6 !NA !NA NA NA !NA !NA !NA NA !NA !NA
## # ℹ 6 more variables: O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## # Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>
#Create nabular data by binding the shadow to the data
# only_miss = TRUE binds shadow columns only for variables with missings
# (City and Datetime are complete, hence 30 columns rather than 32).
head(bind_shadow(AQ_city_hour, only_miss = TRUE))
## # A tibble: 6 × 30
## City Datetime PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3 Benzene
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ahmeda… 2015-01… NA NA 1 40.0 36.4 NA 1 122. NA 0
## 2 Ahmeda… 2015-01… NA NA 0.02 27.8 19.7 NA 0.02 85.9 NA 0
## 3 Ahmeda… 2015-01… NA NA 0.08 19.3 11.1 NA 0.08 52.8 NA 0
## 4 Ahmeda… 2015-01… NA NA 0.3 16.4 9.2 NA 0.3 39.5 154. 0
## 5 Ahmeda… 2015-01… NA NA 0.12 14.9 7.85 NA 0.12 32.6 NA 0
## 6 Ahmeda… 2015-01… NA NA 0.33 16.0 10.8 NA 0.33 29.9 64.2 0
## # ℹ 18 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## # AQI_Bucket <chr>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>,
## # NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>,
## # Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>,
## # AQI_Bucket_NA <fct>
# Lets explore the relationship between AQI missingness and the CO statistics.
# BUG FIX: mean()/sd() need na.rm = TRUE — CO has 86517 NAs (see the
# missingness summary above), so without it both statistics collapse to NA for
# every AQI_NA group, as the first run showed.
# Also renamed the accidental "tCO_mean" column to "CO_mean".
AQ_city_hour %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(AQI_NA) %>%
  summarise(CO_mean = mean(CO, na.rm = TRUE), CO_sd = sd(CO, na.rm = TRUE))
## Density of CO split by AQI missingness, faceted by O3 missingness.
bind_shadow(AQ_city_hour) %>%
ggplot(aes(x = CO,
color = AQI_NA)) +
geom_density() +
facet_wrap(~O3_NA)
## Warning: Removed 86517 rows containing non-finite values (`stat_density()`).
# Explore the missingness in CO and AQI, and display the missingness using `geom_miss_point'
ggplot(AQ_city_hour, aes(x = CO,y = AQI)) + geom_miss_point()
# Looks like there are not too much of missing data
# We would like to impute all the missing data with value below the range by 10%
AQ_city_hour_imp <- impute_below_all(AQ_city_hour)
ggplot(AQ_city_hour_imp, aes(x = CO, y = AQI)) + geom_miss_point()
# But we need to track the imputed values as well
AQ_city_hour_imp_track <- bind_shadow(AQ_city_hour) %>% impute_below_all()
ggplot(AQ_city_hour_imp_track, aes(x = AQI, fill = AQI_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(AQ_city_hour_imp_track, aes(x = O3, fill = O3_NA)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(AQ_city_hour_imp_track, aes(x = CO, y = AQI, color = AQI_NA)) + geom_point()
## So we have successfully imputed all the NA values here
# Now lets impute the critically missing parameters AQI and O3
# via linear regression (impute_lm) against other explanatory parameters
AQ_city_hour_imp_lm_temp <- AQ_city_hour %>% bind_shadow() %>% impute_lm(AQI ~ CO + O3) %>% impute_lm(O3 ~ CO) %>% add_label_shadow()
ggplot(AQ_city_hour_imp_lm_temp, aes(x = CO, y = AQI, color = any_missing)) + geom_miss_point()
##Analysing Airport_delay: Aiport_Delay.csv
## Flight delay records (14,952 rows x 22 columns): airports, carriers,
## ratings, and scheduled vs actual departure/arrival times.
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Airport_delay)
## [1] 14952 22
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Airport_delay)
## Rows: 14,952
## Columns: 22
## $ Date <chr> "28-1-18", "28-1-18", …
## $ Departure.Airport <chr> "BLR", "CCU", "DEL", "…
## $ Departure.Airport.Rating..out.of.10. <dbl> NA, NA, 7.99, 7.29, NA…
## $ Departure.Airport.On.Time.Rating..out.of.10. <dbl> NA, NA, 7.3, 6.2, NA, …
## $ Departure.Airport.Service.Rating..out.of.10. <dbl> NA, NA, 9.1, 9.0, NA, …
## $ Arrival.Airport <chr> "DEL", "DEL", "HYD", "…
## $ Arrival.Airport.Rating..out.of.10. <dbl> 7.99, 7.99, 8.27, 7.99…
## $ Arrival.Airport.On.Time.Rating..out.of.10. <dbl> 7.3, 7.3, 7.8, 7.3, 6.…
## $ Arrival.Airport.Service.Rating..out.of.10. <dbl> 9.1, 9.1, 9.0, 9.1, 9.…
## $ Airplane.Type <chr> "", "", "", "", "", "A…
## $ Expected.Departure.Time <chr> "6:10", "7:00", "7:05"…
## $ Departure.Time <chr> "6:10", "7:01", "7:33"…
## $ Departure.Delay <chr> "0:00:00", "0:01:00", …
## $ Duration <chr> "2:20", "2:09", "1:46"…
## $ Expected.Arrival.Time <chr> "8:55", "9:10", "9:10"…
## $ Arrival.Time <chr> "8:30", "9:10", "9:19"…
## $ Arrival.Time.Delay <chr> "-0:25:00", "0:00:00",…
## $ Carrier <chr> "Air India", "Air Indi…
## $ Carrier.Rating..out.of.10. <dbl> 6.6, 6.6, 6.6, 6.6, 6.…
## $ Carrier.Market.Share..out.of.100. <dbl> 12.0, 12.0, 12.0, 12.0…
## $ Carrier.Load.Factor..out.of.100. <dbl> 80.75, 80.75, 80.75, 8…
## $ Carrier.On.Time.Performance.Rating..out.of.100. <dbl> 70.3, 70.3, 70.3, 70.3…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
## The long dotted names come from read.csv() sanitising headers like
## "Rating (out of 10)".
names(Airport_delay)
## [1] "Date"
## [2] "Departure.Airport"
## [3] "Departure.Airport.Rating..out.of.10."
## [4] "Departure.Airport.On.Time.Rating..out.of.10."
## [5] "Departure.Airport.Service.Rating..out.of.10."
## [6] "Arrival.Airport"
## [7] "Arrival.Airport.Rating..out.of.10."
## [8] "Arrival.Airport.On.Time.Rating..out.of.10."
## [9] "Arrival.Airport.Service.Rating..out.of.10."
## [10] "Airplane.Type"
## [11] "Expected.Departure.Time"
## [12] "Departure.Time"
## [13] "Departure.Delay"
## [14] "Duration"
## [15] "Expected.Arrival.Time"
## [16] "Arrival.Time"
## [17] "Arrival.Time.Delay"
## [18] "Carrier"
## [19] "Carrier.Rating..out.of.10."
## [20] "Carrier.Market.Share..out.of.100."
## [21] "Carrier.Load.Factor..out.of.100."
## [22] "Carrier.On.Time.Performance.Rating..out.of.100."
## Structure dump: all time/delay fields (Departure.Time, Duration, ...) are
## still character strings, not yet parsed to time types.
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Airport_delay)
## 'data.frame': 14952 obs. of 22 variables:
## $ Date : chr "28-1-18" "28-1-18" "28-1-18" "28-1-18" ...
## $ Departure.Airport : chr "BLR" "CCU" "DEL" "BOM" ...
## $ Departure.Airport.Rating..out.of.10. : num NA NA 7.99 7.29 NA 7.99 NA NA 7.99 NA ...
## $ Departure.Airport.On.Time.Rating..out.of.10. : num NA NA 7.3 6.2 NA 7.3 NA NA 7.3 NA ...
## $ Departure.Airport.Service.Rating..out.of.10. : num NA NA 9.1 9 NA 9.1 NA NA 9.1 NA ...
## $ Arrival.Airport : chr "DEL" "DEL" "HYD" "DEL" ...
## $ Arrival.Airport.Rating..out.of.10. : num 7.99 7.99 8.27 7.99 7.29 8.27 7.29 7.99 8.27 7.29 ...
## $ Arrival.Airport.On.Time.Rating..out.of.10. : num 7.3 7.3 7.8 7.3 6.2 7.8 6.2 7.3 7.8 6.2 ...
## $ Arrival.Airport.Service.Rating..out.of.10. : num 9.1 9.1 9 9.1 9 9 9 9.1 9 9 ...
## $ Airplane.Type : chr "" "" "" "" ...
## $ Expected.Departure.Time : chr "6:10" "7:00" "7:05" "7:00" ...
## $ Departure.Time : chr "6:10" "7:01" "7:33" "7:07" ...
## $ Departure.Delay : chr "0:00:00" "0:01:00" "0:28:00" "0:07:00" ...
## $ Duration : chr "2:20" "2:09" "1:46" "1:40" ...
## $ Expected.Arrival.Time : chr "8:55" "9:10" "9:10" "9:05" ...
## $ Arrival.Time : chr "8:30" "9:10" "9:19" "8:47" ...
## $ Arrival.Time.Delay : chr "-0:25:00" "0:00:00" "0:09:00" "-0:18:00" ...
## $ Carrier : chr "Air India" "Air India" "Air India" "Air India" ...
## $ Carrier.Rating..out.of.10. : num 6.6 6.6 6.6 6.6 6.6 7.2 7.2 7.9 7.9 7.9 ...
## $ Carrier.Market.Share..out.of.100. : num 12 12 12 12 12 8.8 8.8 39.7 39.7 39.7 ...
## $ Carrier.Load.Factor..out.of.100. : num 80.8 80.8 80.8 80.8 80.8 ...
## $ Carrier.On.Time.Performance.Rating..out.of.100.: num 70.3 70.3 70.3 70.3 70.3 91.8 91.8 87.4 87.4 87.4 ...
## Column-wise summary; character columns show only length/class, so ""
## placeholders (e.g. in Airplane.Type) are invisible until recoded to NA.
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Airport_delay)
## Date Departure.Airport Departure.Airport.Rating..out.of.10.
## Length:14952 Length:14952 Min. :7.290
## Class :character Class :character 1st Qu.:7.290
## Mode :character Mode :character Median :7.990
## Mean :7.741
## 3rd Qu.:7.990
## Max. :8.270
## NA's :10043
## Departure.Airport.On.Time.Rating..out.of.10.
## Min. :6.200
## 1st Qu.:6.200
## Median :7.300
## Mean :6.908
## 3rd Qu.:7.300
## Max. :7.800
## NA's :10043
## Departure.Airport.Service.Rating..out.of.10. Arrival.Airport
## Min. :9.000 Length:14952
## 1st Qu.:9.000 Class :character
## Median :9.100 Mode :character
## Mean :9.064
## 3rd Qu.:9.100
## Max. :9.100
## NA's :10043
## Arrival.Airport.Rating..out.of.10. Arrival.Airport.On.Time.Rating..out.of.10.
## Min. :7.29 Min. :6.200
## 1st Qu.:7.99 1st Qu.:7.300
## Median :7.99 Median :7.300
## Mean :7.91 Mean :7.187
## 3rd Qu.:7.99 3rd Qu.:7.300
## Max. :8.27 Max. :7.800
##
## Arrival.Airport.Service.Rating..out.of.10. Airplane.Type
## Min. :9.000 Length:14952
## 1st Qu.:9.000 Class :character
## Median :9.100 Mode :character
## Mean :9.059
## 3rd Qu.:9.100
## Max. :9.100
##
## Expected.Departure.Time Departure.Time Departure.Delay
## Length:14952 Length:14952 Length:14952
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Duration Expected.Arrival.Time Arrival.Time Arrival.Time.Delay
## Length:14952 Length:14952 Length:14952 Length:14952
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Carrier Carrier.Rating..out.of.10.
## Length:14952 Min. :6.600
## Class :character 1st Qu.:6.800
## Mode :character Median :7.200
## Mean :7.531
## 3rd Qu.:7.900
## Max. :9.200
##
## Carrier.Market.Share..out.of.100. Carrier.Load.Factor..out.of.100.
## Min. : 3.6 Min. :80.75
## 1st Qu.: 4.0 1st Qu.:81.80
## Median :12.0 Median :86.00
## Mean :13.2 Mean :86.88
## 3rd Qu.:13.1 3rd Qu.:93.30
## Max. :39.7 Max. :93.90
##
## Carrier.On.Time.Performance.Rating..out.of.100.
## Min. :70.30
## 1st Qu.:74.70
## Median :87.40
## Mean :83.14
## 3rd Qu.:89.10
## Max. :91.80
##
## NOTE(review): attach() again snapshots the data frame before the "" -> NA
## recode below; the attached copy keeps the old "" values. Prefer explicit
## Airport_delay$... references.
attach(Airport_delay)
## The following object is masked from AQ_city_day:
##
## Date
## The following object is masked from AQ_station_day:
##
## Date
## Recode empty strings (e.g. blank Airplane.Type) as proper NA.
Airport_delay [Airport_delay == ""] <- NA
## Distinct (departure airport, on-time rating) pairs — grouping followed by an
## empty summarize() acts like distinct() on the grouping columns.
Airport_delay %>% group_by(Departure.Airport, Departure.Airport.On.Time.Rating..out.of.10.)%>%summarize()
## `summarise()` has grouped output by 'Departure.Airport'. You can override using
## the `.groups` argument.
## # A tibble: 5 × 2
## # Groups: Departure.Airport [5]
## Departure.Airport Departure.Airport.On.Time.Rating..out.of.10.
## <chr> <dbl>
## 1 BLR NA
## 2 BOM 6.2
## 3 CCU NA
## 4 DEL 7.3
## 5 HYD 7.8
##Mumbai seems to have the worst rating for departure on time performance
## Same distinct-pairs check for arrival airports.
Airport_delay %>% group_by(Arrival.Airport, Arrival.Airport.On.Time.Rating..out.of.10.)%>%summarize()
## `summarise()` has grouped output by 'Arrival.Airport'. You can override using
## the `.groups` argument.
## # A tibble: 3 × 2
## # Groups: Arrival.Airport [3]
## Arrival.Airport Arrival.Airport.On.Time.Rating..out.of.10.
## <chr> <dbl>
## 1 BOM 6.2
## 2 DEL 7.3
## 3 HYD 7.8
##Mumbai seems to have the worst rating for Arrival on time performance as well
## Remove the entries from the table where tavg is NA
## NOTE(review): each New_* frame subsets the lm-IMPUTED data by
## complete.cases() of the ORIGINAL frame — so rows whose values were just
## imputed get dropped again, which appears to defeat the imputation.
## Confirm whether the intent was complete.cases() on the imputed frame.
New_Weather_Bangalore <- Weather_Bangalore_imp_lm_temp[complete.cases(Weather_Bangalore),]
New_Weather_Chennai <- Weather_Chennai_imp_lm_temp[complete.cases(Weather_Chennai),]
New_Weather_Delhi <- Weather_Delhi_imp_lm_temp[complete.cases(Weather_Delhi),]
New_Weather_Lucknow <- Weather_Lucknow_imp_lm_temp[complete.cases(Weather_Lucknow),]
New_Weather_Mumbai <- Weather_Mumbai_imp_lm_temp[complete.cases(Weather_Mumbai),]
New_Weather_Jodhpur <- Weather_Jodhpur_imp_lm_temp[complete.cases(Weather_Jodhpur),]
## For Bhubhenshwar and Rourkela, we need to first remove the columns snow and tsun which has no valid entries
## We can also remove the wdir, wspd, pressure columns as the other stations are not having them
## And hence having them does not seem to add value for the scope of this analysis
Standard_Weather_Bhubhneshwar <- subset(Weather_Bhubhneshwar, select = -c(snow,wdir,wspd,pres,tsun,wpgt))
New_Weather_Bhubhneshwar <- Standard_Weather_Bhubhneshwar[complete.cases(Standard_Weather_Bhubhneshwar),]
Standard_Weather_Rourkela <- subset(Weather_Rourkela, select = -c(snow,wdir,wspd,pres,tsun,wpgt))
New_Weather_Rourkela <- Standard_Weather_Rourkela[complete.cases(Standard_Weather_Rourkela),]
## When it comes to AQI stations, we need only active stations
New_AQ_stations <- AQ_stations %>% filter(Status == "Active")
## Same original-vs-imputed subsetting pattern as above — see NOTE(review).
New_AQ_station_hour <- AQ_station_hour_imp_lm_temp[complete.cases(AQ_station_hour),]
New_AQ_station_day <- AQ_station_day_imp_lm_temp[complete.cases(AQ_station_day),]
New_AQ_city_hour <- AQ_city_hour_imp_lm_temp[complete.cases(AQ_city_hour),]
New_AQ_city_day <- AQ_city_day_imp_lm_temp[complete.cases(AQ_city_day),]
## Clean the Airport Delay data too
New_Airport_delay <- Airport_delay[complete.cases(Airport_delay),]
## Histograms to eyeball outlier thresholds for Bangalore, then isolate the
## extreme-weather rows and plot precipitation against the temperature range.
hist(x=New_Weather_Bangalore$tavg, main = "Bangalore Average Temparature")
## Data outside <20 and >30 are outliers for Bangalore average
hist(x=New_Weather_Bangalore$tmin, main = "Bangalore Min Temparature")
## Data outside <16 are outliers for Bangalore min
hist(x=New_Weather_Bangalore$tmax, main = "Bangalore Max Temparature")
## Data outside >35 are outliers for Bangalore max
hist(x=New_Weather_Bangalore$prcp, main = "Bangalore Precipitation", breaks = 5)
## Extreme cases are above 50
## So lets make special dataset
Special_Weather_Bangalore <- New_Weather_Bangalore %>% filter((tavg < 20) | (tavg>30) | (tmin < 16) | (tmax > 35) | (prcp > 50))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
## NOTE(review): size is set from a free-standing vector ($prcp/75) rather
## than mapped inside aes(); it only works because filter() preserved row
## order — consider aes(size = prcp) for robustness.
ggplot(Special_Weather_Bangalore, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Bangalore$prcp/75) +
labs(title = "Impact of temperature on precipitation")
## From the picture looks like the extreme precipitation happens when tmin is between 16 and 22
## So, lets put a special filter around that and redraw
Ext_Special_Weather_Bangalore <- Special_Weather_Bangalore %>% filter((tmin > 16) & (tmin < 22))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Ext_Special_Weather_Bangalore, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Bangalore$prcp/75) +
labs(title = "Impact of temperature on precipitation")
## Same outlier workflow for Chennai.
hist(x=New_Weather_Chennai$tavg, main = "Chennai Average Temparature")
## Data outside <15 and >35 are outliers for Chennai average
hist(x=New_Weather_Chennai$tmin, main = "Chennai Min Temparature")
## Data outside <16 are outliers for Chennai min
hist(x=New_Weather_Chennai$tmax, main = "Chennai Max Temparature")
## Data outside >35 are outliers for Chennai max
hist(x=New_Weather_Chennai$prcp, main = "Chennai Precipitation", breaks = 5)
## Extreme cases are above 50
## So lets make special dataset
## NOTE(review): the filter thresholds (tmin < 10, tmax > 30) do not match the
## histogram comments above (<16, >35) — confirm which set is intended.
Special_Weather_Chennai <- New_Weather_Chennai %>% filter((tavg < 15) | (tavg>35) | (tmin < 10) | (tmax > 30) | (prcp > 50))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
## NOTE(review): size uses a free-standing vector and divisor /50 here vs /75
## in the redraw below and in the other cities — confirm intentional.
ggplot(Special_Weather_Chennai, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Chennai$prcp/50) +
labs(title = "Impact of temperature on precipitation")
## From the picture looks like the extreme precipitation happens when tmin is between 20 and 30
## So, lets put a special filter around that and redraw
Ext_Special_Weather_Chennai <- Special_Weather_Chennai %>% filter((tmin > 20) & (tmin < 30))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Ext_Special_Weather_Chennai, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Chennai$prcp/75) +
labs(title = "Impact of temperature on precipitation")
## Same outlier workflow for Delhi.
hist(x=New_Weather_Delhi$tavg, main = "Delhi Average Temparature")
## Data outside <15 and >35 are outliers for Delhi average
hist(x=New_Weather_Delhi$tmin, main = "Delhi Min Temparature")
## Data outside <16 are outliers for Delhi min
hist(x=New_Weather_Delhi$tmax, main = "Delhi Max Temparature")
## Data outside >35 are outliers for Delhi max
hist(x=New_Weather_Delhi$prcp, main = "Delhi Precipitation", breaks = 5)
## Extreme cases are above 50
## So lets make special dataset
## NOTE(review): filter thresholds (tmin < 10, tmax > 30) differ from the
## histogram comments above (<16, >35) — confirm which set is intended.
Special_Weather_Delhi <- New_Weather_Delhi %>% filter((tavg < 15) | (tavg>35) | (tmin < 10) | (tmax > 30) | (prcp > 50))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Special_Weather_Delhi, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Delhi$prcp/50) +
labs(title = "Impact of temperature on precipitation")
## From the picture looks like the extreme precipitation happens when tmin is between 20 and 30
## So, lets put a special filter around that and redraw
Ext_Special_Weather_Delhi <- Special_Weather_Delhi %>% filter((tmin > 20) & (tmin < 30))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Ext_Special_Weather_Delhi, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Delhi$prcp/75) +
labs(title = "Impact of temperature on precipitation")
hist(x=New_Weather_Lucknow$tavg, main = "Lucknow Average Temparature")
## Data outside <16 and >33 are outliers for Lucknow average
hist(x=New_Weather_Lucknow$tmin, main = "Lucknow Min Temparature")
## Data outside <15 are outliers for Lucknow min
hist(x=New_Weather_Lucknow$tmax, main = "Lucknow Max Temparature")
## Data outside >35 are outliers for Lucknow max
hist(x=New_Weather_Lucknow$prcp, main = "Lucknow Precipitation", breaks = 5)
## Extreme cases are above 50
## So lets make special dataset
Special_Weather_Lucknow <- New_Weather_Lucknow %>% filter((tavg < 16) | (tavg>33) | (tmin < 15) | (tmax > 30) | (prcp > 50))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Special_Weather_Lucknow, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Lucknow$prcp/50) +
labs(title = "Impact of temperature on precipitation")
## From the picture looks like the extreme precipitation happens either during when tmin is between 20 to 30
## So, lets put a special filter around that and redraw
Ext_Special_Weather_Lucknow <- Special_Weather_Lucknow %>% filter((tmin > 20) & (tmin < 30))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Ext_Special_Weather_Lucknow, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Lucknow$prcp/75) +
labs(title = "Impact of temperature on precipitation")
hist(x=New_Weather_Mumbai$tavg, main = "Mumbai Average Temparature")
## Data outside <25 and >30 are outliers for Mumbai average
hist(x=New_Weather_Mumbai$tmin, main = "Mumbai Min Temparature")
## Data outside <17 are outliers for Mumbai min
hist(x=New_Weather_Mumbai$tmax, main = "Mumbai Max Temparature")
## Data outside >35 are outliers for Mumbai max
hist(x=New_Weather_Mumbai$prcp, main = "Mumbai Precipitation", breaks = 5)
## Extreme cases are above 50
## So lets make special dataset
Special_Weather_Mumbai <- New_Weather_Mumbai %>% filter((tavg < 25) | (tavg>30) | (tmin < 17) | (tmax > 35) | (prcp > 50))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Special_Weather_Mumbai, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Mumbai$prcp/50) +
labs(title = "Impact of temperature on precipitation")
## From the picture looks like the extreme precipitation happens either during when tmin is between 22 to 27
## So, lets put a special filter around that and redraw
Ext_Special_Weather_Mumbai <- Special_Weather_Mumbai %>% filter((tmin > 22) & (tmin < 27))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Ext_Special_Weather_Mumbai, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Mumbai$prcp/75) +
labs(title = "Impact of temperature on precipitation")
hist(x=New_Weather_Jodhpur$tavg, main = "Jodhpur Average Temparature")
## Data outside <22 and >28 are outliers for Jodhpur average
hist(x=New_Weather_Jodhpur$tmin, main = "Jodhpur Min Temparature")
## Data outside <16 are outliers for Jodhpur min
hist(x=New_Weather_Jodhpur$tmax, main = "Jodhpur Max Temparature")
## Data outside >33 are outliers for Jodhpur max
hist(x=New_Weather_Jodhpur$prcp, main = "Jodhpur Precipitation", breaks = 5)
## Extreme cases are above 50
## So lets make special dataset
Special_Weather_Jodhpur <- New_Weather_Jodhpur %>% filter((tavg < 22) | (tavg>28) | (tmin < 16) | (tmax > 33) | (prcp > 50))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Special_Weather_Jodhpur, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Jodhpur$prcp/50) +
labs(title = "Impact of temperature on precipitation")
## From the picture looks like the extreme precipitation happens either during when tmin is between 17 to 23
## So, lets put a special filter around that and redraw
Ext_Special_Weather_Jodhpur <- Special_Weather_Jodhpur %>% filter((tmin > 17) & (tmin < 23))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Ext_Special_Weather_Jodhpur, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Jodhpur$prcp/75) +
labs(title = "Impact of temperature on precipitation")
hist(x=New_Weather_Bhubhneshwar$tavg, main = "Bhubhenshwar Average Temparature")
## Data outside <24 and >32 are outliers for Bhubhenshwar average
hist(x=New_Weather_Bhubhneshwar$tmin, main = "Bhubhenshwar Min Temparature")
## Data outside <15 are outliers for Bhubhenshwar min
hist(x=New_Weather_Bhubhneshwar$tmax, main = "Bhubhenshwar Max Temparature")
## Data outside >35 are outliers for Bhubhenshwar max
hist(x=New_Weather_Bhubhneshwar$prcp, main = "Bhubhenshwar Precipitation", breaks = 5)
## Extreme cases are above 50
## So lets make special dataset
Special_Weather_Bhubhenshwar <- New_Weather_Bhubhneshwar %>% filter((tavg < 24) | (tavg>32) | (tmin < 15) | (tmax > 35) | (prcp > 50))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Special_Weather_Bhubhenshwar, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Bhubhenshwar$prcp/50) +
labs(title = "Impact of temperature on precipitation")
## From the picture looks like the extreme precipitation happens either during when tmin is between 17 to 27
## So, lets put a special filter around that and redraw
Ext_Special_Weather_Bhubhenshwar <- Special_Weather_Bhubhenshwar %>% filter((tmin > 17) & (tmin < 27))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Ext_Special_Weather_Bhubhenshwar, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Bhubhenshwar$prcp/75) +
labs(title = "Impact of temperature on precipitation")
hist(x=New_Weather_Rourkela$tavg, main = "Rourkela Average Temparature")
## Data outside <20 and >32 are outliers for Rourkela average
hist(x=New_Weather_Rourkela$tmin, main = "Rourkela Min Temparature")
## Data outside <15 are outliers for Rourkela min
hist(x=New_Weather_Rourkela$tmax, main = "Rourkela Max Temparature")
## Data outside >35 are outliers for Rourkela max
hist(x=New_Weather_Rourkela$prcp, main = "Rourkela Precipitation", breaks = 5)
## Extreme cases are above 40
## So lets make special dataset
Special_Weather_Rourkela <- New_Weather_Rourkela %>% filter((tavg < 20) | (tavg>32) | (tmin < 15) | (tmax > 30) | (prcp > 40))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Special_Weather_Rourkela, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Rourkela$prcp/50) +
labs(title = "Impact of temperature on precipitation")
## From the picture looks like the extreme precipitation happens either during when tmin is between 22 to 27
## So, lets put a special filter around that and redraw
Ext_Special_Weather_Rourkela <- Special_Weather_Rourkela %>% filter((tmin > 22) & (tmin < 27))
## And since the precipitation makes the most impact on flights, lets take a look at how precipitation gets impacted by temperatures
ggplot(Ext_Special_Weather_Rourkela, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Rourkela$prcp/75) +
labs(title = "Impact of temperature on precipitation")
head(New_AQ_station_hour)
## # A tibble: 6 × 33
## StationId Datetime PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AP001 2017-11-25 09… 104 148. 1.93 23 13.8 9.8 0.1 15.3 118.
## 2 AP001 2017-11-25 10… 94.5 142 1.33 16.2 9.75 9.65 0.1 17 136.
## 3 AP001 2017-11-25 11… 82.8 126. 1.47 14.8 9.07 9.7 0.1 15.4 150.
## 4 AP001 2017-11-25 14… 68.5 117 1.35 13.6 8.35 7.4 0.1 21.8 162.
## 5 AP001 2017-11-25 15… 69.2 112. 1.52 11.8 7.55 9.25 0.1 21.4 162.
## 6 AP001 2017-11-25 16… 70 107 2.8 30.3 18.4 6.15 0.1 18.9 148.
## # ℹ 22 more variables: Benzene <dbl>, Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## # AQI_Bucket <chr>, StationId_NA <fct>, Datetime_NA <fct>, PM2.5_NA <fct>,
## # PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>,
## # CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## # Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>
# Lets see the performance of the AQI over years
AQ_station_Day_Sep <- New_AQ_station_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr))
AQ_station_Day_Duration <- AQ_station_Day_Sep %>% mutate(Duration=cut(Hour, breaks=c(-1, 6, 18, 24),labels=c("Early_Morning","Day","Night")))
AQI_Over_Years <- AQ_station_Day_Duration%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Over_Years, aes(x = YEAR, y = Mean_AQI, color=AQI_Bucket))+ geom_line()
## It appears that 'Severe' and 'Poor' cases didn't exist much until 2017 from which these
## two gained at the behest of 'Good' AQI cases
# Lets see the performance of the AQI over a day in every year
AQI_Over_Time <- AQ_station_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
## Duration is already encoded by color; passing the raw factor as a point
## size (outside aes()) previously triggered "'*' not meaningful for factors"
## warnings and produced no usable size mapping. Use a fixed size instead.
ggplot(AQI_Over_Time, aes(x = YEAR, y = Mean_AQI, color = Duration)) + geom_point(size = 2)
## We can see that the year 2017 had witnessed the worst air quality index but much of that
## was during the day time. Things slowed down in the years later but in them,
## but the pattern changed by having night time pollution as the worst.
## In all cases, early morning pollution was the lowest.
# Lets see the performance of the AQI monthwise
AQI_monthwise <- AQ_station_Day_Duration %>% group_by(Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## Duration is already encoded by color; a factor cannot act as a point size
## (it produced "'*' not meaningful for factors" warnings), so fix the size.
ggplot(AQI_monthwise, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(size = 2)
## We can see that the colder months - i.e., from Oct to Feb, the AQI is the worst, its bad during summer but it appears the best in monsoon season.
AQI_Over_month <- AQ_station_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
# Lets see if how this works out yearwise and monthwise
## Duration is already mapped to color; using the raw factor as a point size
## previously produced six "'*' not meaningful for factors" warnings (one per
## facet). A constant size keeps the plot readable without the warnings.
ggplot(AQI_Over_month, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(size = 2) + facet_wrap(~YEAR)
## We can see the same trend every year - i.e., the colder months has the worst AQI while the monsoon has the best AQI while summer/spring time having the intermediate values
## Now lets report this city wise - probably for the Month wise combination
AQI_Stationwise <- AQ_station_Day_Duration %>% group_by(Station = StationId, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Station', 'YEAR', 'Month'. You can
## override using the `.groups` argument.
ggplot(AQI_Stationwise, aes(x = Month, y = Mean_AQI, color = Station))+ geom_point(shape = AQI_Stationwise$Duration)
## Across stations, the trend seems to be the same - i.e., worst during winter, intermediate during spring/summer, best during monsoon.
## Now out of the 19 stations, we are very interested on just interested on Delhi for which we are going to do air traffic impact analysis - so lets filter them and zoom into their performance alone
AQI_Delhi_Station <- AQ_station_Day_Duration %>% filter( (StationId == "DL001") | (StationId == "DL019")) %>% group_by(Station = StationId, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Station', 'YEAR', 'Month'. You can
## override using the `.groups` argument.
## Duration is already encoded by color; the raw factor as a size produced
## repeated "'*' not meaningful for factors" warnings. Use a fixed size.
ggplot(AQI_Delhi_Station, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(size = 2) + facet_wrap(~YEAR)
# Lets see how AQ day data is different from station hour wise data
New_AQ_station_day_Years <- New_AQ_station_day%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
head(AQ_station_Day_Duration)
## # A tibble: 6 × 38
## StationId Date Hr Min Sec PM2.5 PM10 NO NO2 NOx NH3 CO
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AP001 2017-11… 09 00 00 104 148. 1.93 23 13.8 9.8 0.1
## 2 AP001 2017-11… 10 00 00 94.5 142 1.33 16.2 9.75 9.65 0.1
## 3 AP001 2017-11… 11 00 00 82.8 126. 1.47 14.8 9.07 9.7 0.1
## 4 AP001 2017-11… 14 00 00 68.5 117 1.35 13.6 8.35 7.4 0.1
## 5 AP001 2017-11… 15 00 00 69.2 112. 1.52 11.8 7.55 9.25 0.1
## 6 AP001 2017-11… 16 00 00 70 107 2.8 30.3 18.4 6.15 0.1
## # ℹ 26 more variables: SO2 <dbl>, O3 <dbl>, Benzene <dbl>, Toluene <dbl>,
## # Xylene <dbl>, AQI <dbl>, AQI_Bucket <chr>, StationId_NA <fct>,
## # Datetime_NA <fct>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>,
## # NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>,
## # O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>,
## # AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>, Hour <dbl>,
## # Duration <fct>
## There seems to be nothing new that we can derive out of the station day wise that we can't derive out of
## station hour wise data. so no further analysis needed over here
## Lets look at City wise hourly AQI data
head(New_AQ_city_hour)
## # A tibble: 6 × 33
## City Datetime PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3 Benzene
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Amarav… 2017-11… 104 148. 1.93 23 13.8 9.8 0.1 15.3 118. 0.3
## 2 Amarav… 2017-11… 94.5 142 1.33 16.2 9.75 9.65 0.1 17 136. 0.28
## 3 Amarav… 2017-11… 82.8 126. 1.47 14.8 9.07 9.7 0.1 15.4 150. 0.2
## 4 Amarav… 2017-11… 68.5 117 1.35 13.6 8.35 7.4 0.1 21.8 162. 0.1
## 5 Amarav… 2017-11… 69.2 112. 1.52 11.8 7.55 9.25 0.1 21.4 162. 0.1
## 6 Amarav… 2017-11… 70 107 2.8 30.3 18.4 6.15 0.1 18.9 148. 0.1
## # ℹ 21 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## # AQI_Bucket <chr>, City_NA <fct>, Datetime_NA <fct>, PM2.5_NA <fct>,
## # PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>,
## # CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## # Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>
# Lets see the performance of the AQI over years
AQ_city_Day_Sep <- New_AQ_city_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr))
AQ_city_Day_Duration <- AQ_city_Day_Sep %>% mutate(Duration=cut(Hour, breaks=c(-1, 6, 18, 24),labels=c("Early_Morning","Day","Night")))
## Now get it grouped by Year and plot year wise performance
AQI_City_Over_Years <- AQ_city_Day_Duration%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_City_Over_Years, aes(x = YEAR, y = Mean_AQI, color=AQI_Bucket))+ geom_line()
AQI_City_Over_Time <- AQ_city_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
## Duration is already encoded by color; a factor is not a valid point size
## (it produced "'*' not meaningful for factors" warnings). Fixed size used.
ggplot(AQI_City_Over_Time, aes(x = YEAR, y = Mean_AQI, color = Duration)) + geom_point(size = 2)
## It appears 2015 had peak values of AQIs, which dropped to very low in 2016, gained to half the levels back in 2017 and then gradually reducing
## We can see that 2015-2017 worst was during day time but from 2018, there were worse night times - may be something to do with dropped levels of AQIs as well
## In all cases, early morning pollution seems to be the lowest.
# Lets see the performance of the AQI month wise
AQI_City_monthwise <- AQ_city_Day_Duration %>% group_by(Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## Duration is already encoded by color; a factor is not a valid point size
## (it produced "'*' not meaningful for factors" warnings). Fixed size used.
ggplot(AQI_City_monthwise, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(size = 2)
## We can see that the winter months - i.e., from Oct to Feb, the AQI is the worst, its bad during summer but it appears the best in monsoon season. The difference from the stationwise data is that here Nov seems to be the worst month, while in the other dataset Dec held the worst...
AQI_City_Over_month <- AQ_city_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
AQI_City_Over_month
## # A tibble: 185 × 4
## # Groups: YEAR, Month [63]
## YEAR Month Duration Mean_AQI
## <dbl> <ord> <fct> <dbl>
## 1 2015 Jan Early_Morning 343.
## 2 2015 Jan Day 341.
## 3 2015 Jan Night 341.
## 4 2015 Feb Early_Morning 329.
## 5 2015 Feb Day 329.
## 6 2015 Feb Night 325.
## 7 2015 Mar Early_Morning 249.
## 8 2015 Mar Day 262.
## 9 2015 Mar Night 254.
## 10 2015 Apr Early_Morning 304.
## # ℹ 175 more rows
# Lets see if how this works out yearwise and monthwise
## Duration is already mapped to color; the raw factor passed as a size
## produced six "'*' not meaningful for factors" warnings (one per facet).
ggplot(AQI_City_Over_month, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(size = 2) + facet_wrap(~YEAR)
## We can see the same trend every year - i.e., the winter months has the worst AQI while the monsoon has the best AQI while summer/spring time having the intermediate values
## Now lets report this city wise - probably for the Month wise combination
AQI_Citywise <- AQ_city_Day_Sep %>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Citywise, aes(x = Month, y = Mean_AQI))+ geom_point(aes(color=City))
## Across stations, the trend seems to be the same - i.e., worst during winter, intermediate during spring/summer, best during monsoon.
## Now out of all the cities, we are very interested on Delhi for which we are going to do air traffic impact analysis - so lets filter them and zoom into their performance alone
AQI_Delhi_City <- AQ_city_Day_Sep %>% filter( City == "Delhi") %>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Delhi_City, aes(x = Month, y = Mean_AQI))+ geom_point() + facet_wrap(~YEAR)
New_AQ_city_day_Years <- New_AQ_city_day%>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
head(New_AQ_city_day_Years)
## # A tibble: 6 × 4
## # Groups: City, YEAR [2]
## City YEAR Month Mean_AQI
## <chr> <dbl> <ord> <dbl>
## 1 Amaravati 2017 Nov 184.
## 2 Amaravati 2017 Dec 194.
## 3 Amaravati 2018 Jan 172.
## 4 Amaravati 2018 Feb 107.
## 5 Amaravati 2018 Mar 84.6
## 6 Amaravati 2018 Apr 63.8
ggplot(New_AQ_city_day_Years, aes(x = Month, y = Mean_AQI))+ geom_point(aes(color=City))
## There seems to be a small difference when comparing hour wise data to day wise data, but not significant enough. So we will mainly use the hour wise data for citywise analysis.
## We would like to understand which of the parameters are really affecting AQI value.
## Based on the analysis above we will stick to using the Cleaned Station hour wise datasets.
New_AQ_station_hour_sep <- New_AQ_station_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr), Month = month(ymd(Date)))
## Now lets focus on the months where we have the most troubles with AQI - Oct to Feb
New_AQ_station_hour_sep_BM <- New_AQ_station_hour_sep %>% filter ((Month == 1) | (Month == 2) | (Month == 10) | (Month == 11) | (Month == 12))
AQI_O3_model <- lm(AQI~O3, data = New_AQ_station_hour_sep)
fmodel(AQI_O3_model)
## OK vow, looks like AQI has direct relationship with the O3 content
AQI_O3_model_BM <- lm(AQI~O3, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_O3_model_BM)
## In bad months looks like O3 and AQI are inversely proportional
## Lets try with PM2.5
AQI_PM_2_5_model <- lm(AQI~PM2.5, data = New_AQ_station_hour_sep)
fmodel(AQI_PM_2_5_model)
## OK even here there is an impact - actually much more
AQI_PM_2_5_model_BM <- lm(AQI~PM2.5, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_PM_2_5_model_BM)
## PM2.5 impact seems to be much higher over the winter months
##Lets try others
AQI_PM_10_model <- lm(AQI~PM10, data = New_AQ_station_hour_sep)
fmodel(AQI_PM_10_model)
AQI_PM_10_model_BM <- lm(AQI~PM10, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_PM_10_model_BM)
## No significant impact change in winter months for PM10
AQI_NO_model <- lm(AQI~NO, data = New_AQ_station_hour_sep)
fmodel(AQI_NO_model)
AQI_NO_model_BM <- lm(AQI~NO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NO_model_BM)
## Slight reduction in winter months for NO
AQI_NO2_model <- lm(AQI~NO2, data = New_AQ_station_hour_sep)
fmodel(AQI_NO2_model)
AQI_NO2_model_BM <- lm(AQI~NO2, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NO2_model_BM)
## No significant impact change in winter months for NO2
AQI_NOx_model <- lm(AQI~NOx, data = New_AQ_station_hour_sep)
fmodel(AQI_NOx_model)
AQI_NOx_model_BM <- lm(AQI~NOx, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NOx_model_BM)
## Slight reduction in winter months for NOx
AQI_NH3_model <- lm(AQI~NH3, data = New_AQ_station_hour_sep)
fmodel(AQI_NH3_model)
AQI_NH3_model_BM <- lm(AQI~NH3, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NH3_model_BM)
## NH3 impact seems to be much higher (50% more) over the winter months
AQI_CO_model <- lm(AQI~CO, data = New_AQ_station_hour_sep)
fmodel(AQI_CO_model)
AQI_CO_model_BM <- lm(AQI~CO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_CO_model_BM)
## No significant impact change in winter months for CO
AQI_SO2_model <- lm(AQI~SO2, data = New_AQ_station_hour_sep)
fmodel(AQI_SO2_model)
AQI_SO2_model_BM <- lm(AQI~SO2, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_SO2_model_BM)
## Slight reduction in winter months for SO2
AQI_Benzene_model <- lm(AQI~Benzene, data = New_AQ_station_hour_sep)
fmodel(AQI_Benzene_model)
## Fit the Benzene model on the bad (winter) months subset. Fixed: store it
## in its own _BM variable instead of overwriting AQI_Benzene_model, matching
## the naming convention used for every other pollutant model above.
AQI_Benzene_model_BM <- lm(AQI~Benzene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Benzene_model_BM)
## Slight reduction in winter months for Benzene
AQI_Toluene_model <- lm(AQI~Toluene, data = New_AQ_station_hour_sep)
fmodel(AQI_Toluene_model)
AQI_Toluene_model_BM <- lm(AQI~Toluene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Toluene_model_BM)
## Slight reduction in winter months for Toulene
AQI_Xylene_model <- lm(AQI~Xylene, data = New_AQ_station_hour_sep)
fmodel(AQI_Xylene_model)
AQI_Xylene_model_BM <- lm(AQI~Xylene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Xylene_model_BM)
## No significant impact change in winter months for Xylene
## Among these, the highest impact seems to be from CO. Bringing in
## O3 due to their peculiar reversal in Winter months
AQI_High_Impact_model <- lm(AQI~O3+CO, data = New_AQ_station_hour_sep)
fmodel(AQI_High_Impact_model)
AQI_High_Impact_model_BM <- lm(AQI~O3+CO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_High_Impact_model_BM)
evaluate_model(AQI_High_Impact_model)
## O3 CO model_output
## 1 0 0.0 67.22385
## 2 50 0.0 88.07392
## 3 100 0.0 108.92398
## 4 0 0.5 95.43036
## 5 50 0.5 116.28043
## 6 100 0.5 137.13050
## 7 0 1.0 123.63687
## 8 50 1.0 144.48694
## 9 100 1.0 165.33701
evaluate_model(AQI_High_Impact_model_BM)
## O3 CO model_output
## 1 0 0 108.7309
## 2 50 0 116.8346
## 3 100 0 124.9383
## 4 0 1 157.0225
## 5 50 1 165.1262
## 6 100 1 173.2299
## 7 0 2 205.3141
## 8 50 2 213.4178
## 9 100 2 221.5215
## Definitely, the bad months bring in a lot of difference into the data set.
## So let's bring the month in as an additional explanatory variable
New_AQ_station_hour_sep
## # A tibble: 203,693 × 38
## StationId Date Hr Min Sec PM2.5 PM10 NO NO2 NOx NH3 CO
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AP001 2017-1… 09 00 00 104 148. 1.93 23 13.8 9.8 0.1
## 2 AP001 2017-1… 10 00 00 94.5 142 1.33 16.2 9.75 9.65 0.1
## 3 AP001 2017-1… 11 00 00 82.8 126. 1.47 14.8 9.07 9.7 0.1
## 4 AP001 2017-1… 14 00 00 68.5 117 1.35 13.6 8.35 7.4 0.1
## 5 AP001 2017-1… 15 00 00 69.2 112. 1.52 11.8 7.55 9.25 0.1
## 6 AP001 2017-1… 16 00 00 70 107 2.8 30.3 18.4 6.15 0.1
## 7 AP001 2017-1… 17 00 00 72.8 120. 1.5 26.7 15.4 10.8 0.1
## 8 AP001 2017-1… 18 00 00 81.5 135. 1.1 18.8 10.9 14.7 0.1
## 9 AP001 2017-1… 19 00 00 85 142. 1.62 26.2 15.3 14.5 0.2
## 10 AP001 2017-1… 20 00 00 91.5 146. 0.98 18.9 10.8 14.1 0.2
## # ℹ 203,683 more rows
## # ℹ 26 more variables: SO2 <dbl>, O3 <dbl>, Benzene <dbl>, Toluene <dbl>,
## # Xylene <dbl>, AQI <dbl>, AQI_Bucket <chr>, StationId_NA <fct>,
## # Datetime_NA <fct>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>,
## # NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>,
## # O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>,
## # AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>, Hour <dbl>, …
AQI_High_Impact_model_Month <- lm(AQI~O3+CO+month(ymd(Date)), data = New_AQ_station_hour_sep)
fmodel(AQI_High_Impact_model_Month)
evaluate_model(AQI_High_Impact_model_Month)
## O3 CO Date model_output
## 1 0 0.0 2020-03-10 65.46391
## 2 50 0.0 2020-03-10 86.35594
## 3 100 0.0 2020-03-10 107.24797
## 4 0 0.5 2020-03-10 93.61949
## 5 50 0.5 2020-03-10 114.51152
## 6 100 0.5 2020-03-10 135.40355
## 7 0 1.0 2020-03-10 121.77507
## 8 50 1.0 2020-03-10 142.66710
## 9 100 1.0 2020-03-10 163.55913
## 10 0 0.0 2020-03-07 65.46391
## 11 50 0.0 2020-03-07 86.35594
## 12 100 0.0 2020-03-07 107.24797
## 13 0 0.5 2020-03-07 93.61949
## 14 50 0.5 2020-03-07 114.51152
## 15 100 0.5 2020-03-07 135.40355
## 16 0 1.0 2020-03-07 121.77507
## 17 50 1.0 2020-03-07 142.66710
## 18 100 1.0 2020-03-07 163.55913
## 19 0 0.0 2020-03-08 65.46391
## 20 50 0.0 2020-03-08 86.35594
## 21 100 0.0 2020-03-08 107.24797
## 22 0 0.5 2020-03-08 93.61949
## 23 50 0.5 2020-03-08 114.51152
## 24 100 0.5 2020-03-08 135.40355
## 25 0 1.0 2020-03-08 121.77507
## 26 50 1.0 2020-03-08 142.66710
## 27 100 1.0 2020-03-08 163.55913
## Having month as part of the model really makes a difference to the evaluation.
## Now lets train the model and see if we can predict the values of AQI
#make this split reproducible
set.seed(1)
#Use 70% of dataset as training set and remaining 30% as testing set
AQI_sample_set <- sample(c(TRUE, FALSE), nrow(New_AQ_station_hour_sep), replace=TRUE, prob=c(0.7,0.3))
AQI_train_dataset <- New_AQ_station_hour_sep[AQI_sample_set, ]
AQI_test_dataset <- New_AQ_station_hour_sep[!AQI_sample_set, ]
AQI_Eval_model = lm(AQI~O3+CO+month(ymd(Date)), data = AQI_train_dataset)
summary(AQI_Eval_model)
##
## Call:
## lm(formula = AQI ~ O3 + CO + month(ymd(Date)), data = AQI_train_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2339.28 -41.21 -14.55 18.77 728.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.602092 0.464663 136.88 <2e-16 ***
## O3 0.419003 0.006315 66.35 <2e-16 ***
## CO 56.352626 0.294424 191.40 <2e-16 ***
## month(ymd(Date)) 0.576123 0.048342 11.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.87 on 142592 degrees of freedom
## Multiple R-squared: 0.213, Adjusted R-squared: 0.213
## F-statistic: 1.286e+04 on 3 and 142592 DF, p-value: < 2.2e-16
Predicted_AQI_Values <- predict(AQI_Eval_model, AQI_test_dataset)
AQI_test_dataset["Predicted_AQI"] <- Predicted_AQI_Values
Summary_AQI_Model_Performace <- AQI_test_dataset %>% group_by(YEAR = year(ymd(Date)), Month) %>% summarise(AQI, Predicted_AQI)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
Summary_AQI_Model_Performace
## # A tibble: 61,097 × 4
## # Groups: YEAR, Month [58]
## YEAR Month AQI Predicted_AQI
## <dbl> <dbl> <dbl> <dbl>
## 1 2015 9 61 95.8
## 2 2015 9 61 117.
## 3 2015 9 72 126.
## 4 2015 9 72 95.4
## 5 2015 9 77 119.
## 6 2015 9 66 129.
## 7 2015 9 66 102.
## 8 2015 9 66 108.
## 9 2015 9 62 96.7
## 10 2015 9 62 106.
## # ℹ 61,087 more rows
## Overlay actual vs predicted AQI per month, faceted by year.
## Fixed legend label typo: "Predictede_AQI" -> "Predicted_AQI".
ggplot(Summary_AQI_Model_Performace, aes(x = Month)) +
geom_point(aes(y = AQI, color = 'AQI')) +
geom_point(aes(y = Predicted_AQI, color = 'Predicted_AQI')) +
scale_x_continuous(breaks=seq(1, 12, by = 1))+
labs(title = "AQI Model Performance") + facet_wrap(~YEAR)
## We can see that there are a good amount overlaps between the AQI prediction vs actual data though
## there is still a very large scope of improvement of the model - esp when dealing with outliers.
## But so far we have sufficient proof available that AQI is heavily influenced by
## month of the year and quantities of O3 and CO.
## We have seen how components of air impacted AQI
## Time to see the impact of weather on AQI by merging the station day wise data with the weather data
## Please note we are not picking up station hour wise data because the weather data we have is only day wise data
## Out of the cities for which weather has been provided, the only city that overlaps with the AQI data is Delhi
## And ofcourse we are trying to find the impact of AQI on Airtraffic in Delhi, so lets bring in that too
## So lets merge these three datasets only for Delhi
Delhi_AQI_data_temp <- New_AQ_station_day %>% filter ((StationId == "DL001") | (StationId == "DL019"))%>% mutate(Date_1 = ymd(as.Date(Date)))
Delhi_AQI_data <- Delhi_AQI_data_temp[, -2] %>% rename("Date" = "Date_1")
Delhi_AQI_data
## # A tibble: 1,126 × 33
## StationId PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3 Benzene
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 DL001 238. 349. 3.25 79.0 44.6 36.0 1.53 14.4 45.3 6.64
## 2 DL001 285. 427. 20.0 113. 76.2 41.2 1.43 21.3 30.4 8.8
## 3 DL001 150. 214. 6.35 96.2 56.3 36.0 0.99 12.7 41.9 5.64
## 4 DL001 106. 154. 7.36 77.5 47.2 30.4 0.7 12.5 27.9 4.2
## 5 DL001 146. 211. 2.01 60.3 33.6 28.5 0.71 11.6 46.1 2.61
## 6 DL001 167. 272. 9.92 80.4 50.8 29.9 0.98 14 33.5 3.84
## 7 DL001 219. 347. 23.9 99.6 72.5 35.0 1.33 19.8 44.8 5.56
## 8 DL001 227. 356. 2.1 77.9 43.2 37.1 1.02 21.2 52.6 4.49
## 9 DL001 86.5 206. 2.71 58.0 33.0 25.6 0.59 18.2 39.4 2.33
## 10 DL001 150. 287. 23.2 70.8 56.6 28.2 1 20.7 34.0 3.84
## # ℹ 1,116 more rows
## # ℹ 22 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## # AQI_Bucket <chr>, StationId_NA <fct>, Date_NA <fct>, PM2.5_NA <fct>,
## # PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>,
## # CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## # Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>,
## # Date <date>
New_Weather_Delhi_day <- New_Weather_Delhi %>% mutate(Date = dmy(time))
Delhi_Airport_Delay_date <- New_Airport_delay %>% filter (Departure.Airport == "DEL") %>% mutate(Date_1 = dmy(Date))
Delhi_Airport_Delay_rename <- Delhi_Airport_Delay_date[, -1] %>% rename("Date" = "Date_1")
Delhi_Airport_Delay_date_sorted <- Delhi_Airport_Delay_rename[order(Delhi_Airport_Delay_rename$Date),]
## The range of weather data is from 01/01/1990 to 25/07/2022
## The range of airport delay data is from 28/01/2018 to 27/01/2020
## So the overlapping window used in the filters below is 25/01/2018 to 27/01/2020
Delhi_Airport_Delay_range <- Delhi_Airport_Delay_date_sorted %>% filter (between(Date, as.Date('2018-01-25'), as.Date('2020-01-27')))
#Delhi_Airport_Delay <- Delhi_Airport_Delay_dates %>% filter ((Date >'25-01-18') & (Date < '29-01-20')) #1925
New_Weather_Delhi_day_range <- New_Weather_Delhi_day %>% filter (between(Date, as.Date('2018-01-25'), as.Date('2020-01-27')))
Delhi_AQI_data_range <- Delhi_AQI_data %>% filter (between(Date, as.Date('2018-01-25'), as.Date('2020-01-27')))
##Delhi_Airport_Delay data has multiple entries for a day as it is cutting across many airliners operating on an airport. But we are interested in average delay per day and not really on the airliner related information. So, lets clean the data a bit there.
## Convert a single delay value to whole minutes.
##
## x: one delay value in a form understood by lubridate::hms()
##    (presumably "HH:MM:SS"-style text -- TODO confirm against the raw data).
## Returns: the delay in minutes; negative delays (early departures/arrivals)
##          are clamped to 0, and missing input yields NA.
convert_min <- function(x)
{
  if (is.na(x)) {
    ## Guard: `if (x < 0)` on NA would error out of the whole lapply();
    ## propagate NA instead so missing delays stay visible downstream.
    return(NA_real_)
  }
  if (x < 0) {
    ## NOTE(review): when x is character this is a lexicographic comparison;
    ## it works because negative delays start with "-", which sorts before
    ## "0" -- confirm the column type.
    0
  } else {
    time_d <- hms(x)
    hour(time_d) * 60 + minute(time_d)
  }
}
## Apply the scalar convert_min helper element-wise to turn each per-flight
## delay value into whole minutes (negatives clamped to 0 by convert_min).
Delhi_Airport_Delay_in_min <- Delhi_Airport_Delay_range %>% mutate (Departure_Delay_min = unlist(lapply(Departure.Delay, convert_min)), Arrival_Delay_min = unlist(lapply(Arrival.Time.Delay, convert_min)))
## Collapse the many flight rows per day into a single total daily delay
## (departure + arrival minutes summed over all flights on that Date).
Delhi_Airport_Delay_datewise <- Delhi_Airport_Delay_in_min %>% group_by(Date) %>% summarize(Daily_Delay = sum(Departure_Delay_min + Arrival_Delay_min))
## Base merge() joins on all common column names by default -- presumably just
## Date here; verify, since any accidentally-shared column would silently
## change the join keys.
Delhi_AQI_weather_data_merge_temp <- merge(New_Weather_Delhi_day, Delhi_AQI_data)
## Drop column 3 by position. NOTE(review): positional indexing is fragile if
## the merged frame's column order ever changes -- confirm which column this
## is meant to remove.
Delhi_AQI_weather_data_merge_temp_1 <- Delhi_AQI_weather_data_merge_temp[,-3]
## Join in the daily delay totals and derive the calendar month for later
## seasonal analysis. The printed result below shows duplicated dates, which
## is consistent with two stations (DL001/DL019) contributing rows per date.
Delhi_cohesive_dataset <- merge(Delhi_AQI_weather_data_merge_temp_1, Delhi_Airport_Delay_datewise)%>% mutate(Month = month(ymd(Date)))
Delhi_cohesive_dataset
## Date any_missing tavg tmin tmax prcp time_NA tavg_NA tmin_NA tmax_NA
## 1 2018-06-27 Not Missing 30.3 26.2 37.5 3.0 !NA !NA !NA !NA
## 2 2018-06-28 Not Missing 29.9 24.2 37.5 20.1 !NA !NA !NA !NA
## 3 2018-06-29 Not Missing 30.7 27.9 35.2 1.0 !NA !NA !NA !NA
## 4 2018-06-30 Not Missing 31.3 27.5 35.6 9.9 !NA !NA !NA !NA
## 5 2018-07-04 Not Missing 31.7 26.1 36.7 5.1 !NA !NA !NA !NA
## 6 2018-07-06 Not Missing 32.9 28.1 37.3 5.1 !NA !NA !NA !NA
## 7 2018-07-07 Not Missing 33.7 28.2 39.1 0.0 !NA !NA !NA !NA
## 8 2018-07-12 Not Missing 32.4 28.6 36.7 0.0 !NA !NA !NA !NA
## 9 2018-07-14 Not Missing 30.3 25.4 34.2 22.1 !NA !NA !NA !NA
## 10 2018-07-15 Not Missing 31.8 27.6 36.3 7.1 !NA !NA !NA !NA
## 11 2018-07-16 Not Missing 29.8 28.1 35.4 0.0 !NA !NA !NA !NA
## 12 2018-07-17 Not Missing 32.6 28.0 37.1 24.9 !NA !NA !NA !NA
## 13 2018-07-18 Not Missing 32.9 28.6 37.2 0.0 !NA !NA !NA !NA
## 14 2018-07-19 Not Missing 32.8 28.8 37.3 0.8 !NA !NA !NA !NA
## 15 2018-07-20 Not Missing 29.2 27.1 37.8 23.9 !NA !NA !NA !NA
## 16 2018-07-21 Not Missing 29.2 27.4 33.4 0.0 !NA !NA !NA !NA
## 17 2018-07-22 Not Missing 29.0 26.0 34.4 18.0 !NA !NA !NA !NA
## 18 2018-07-23 Not Missing 30.7 26.2 34.6 70.1 !NA !NA !NA !NA
## 19 2018-07-24 Not Missing 32.4 28.2 36.4 0.0 !NA !NA !NA !NA
## 20 2018-07-26 Not Missing 26.9 26.2 32.2 5.1 !NA !NA !NA !NA
## 21 2018-07-27 Not Missing 26.7 25.2 28.7 46.0 !NA !NA !NA !NA
## 22 2018-07-28 Not Missing 28.2 25.4 31.8 3.0 !NA !NA !NA !NA
## 23 2018-07-29 Not Missing 30.3 26.0 34.4 5.1 !NA !NA !NA !NA
## 24 2018-07-30 Not Missing 30.9 23.0 34.8 24.9 !NA !NA !NA !NA
## 25 2018-07-31 Not Missing 30.0 27.4 34.5 0.0 !NA !NA !NA !NA
## 26 2018-08-06 Not Missing 28.1 26.6 33.2 3.0 !NA !NA !NA !NA
## 27 2018-08-07 Not Missing 28.2 26.4 31.8 2.0 !NA !NA !NA !NA
## 28 2018-08-08 Not Missing 30.4 26.5 33.4 4.1 !NA !NA !NA !NA
## 29 2018-08-09 Not Missing 32.0 27.6 35.4 0.5 !NA !NA !NA !NA
## 30 2018-11-03 Not Missing 25.2 18.5 31.6 0.3 !NA !NA !NA !NA
## 31 2018-11-04 Not Missing 24.4 18.0 30.6 0.0 !NA !NA !NA !NA
## 32 2018-11-14 Not Missing 22.2 17.6 29.4 7.1 !NA !NA !NA !NA
## 33 2018-11-14 Not Missing 22.2 17.6 29.4 7.1 !NA !NA !NA !NA
## 34 2018-11-15 Not Missing 22.2 16.2 28.5 1.0 !NA !NA !NA !NA
## 35 2018-11-15 Not Missing 22.2 16.2 28.5 1.0 !NA !NA !NA !NA
## 36 2018-11-16 Not Missing 21.1 14.5 27.5 0.0 !NA !NA !NA !NA
## 37 2018-11-16 Not Missing 21.1 14.5 27.5 0.0 !NA !NA !NA !NA
## 38 2018-12-12 Not Missing 17.2 12.6 22.3 0.5 !NA !NA !NA !NA
## 39 2018-12-12 Not Missing 17.2 12.6 22.3 0.5 !NA !NA !NA !NA
## 40 2018-12-13 Not Missing 16.4 11.2 21.6 1.0 !NA !NA !NA !NA
## 41 2018-12-13 Not Missing 16.4 11.2 21.6 1.0 !NA !NA !NA !NA
## 42 2018-12-14 Not Missing 15.7 8.0 21.4 0.0 !NA !NA !NA !NA
## 43 2018-12-14 Not Missing 15.7 8.0 21.4 0.0 !NA !NA !NA !NA
## 44 2019-01-06 Not Missing 13.9 8.6 21.2 2.0 !NA !NA !NA !NA
## 45 2019-01-06 Not Missing 13.9 8.6 21.2 2.0 !NA !NA !NA !NA
## 46 2019-01-21 Not Missing 16.4 11.5 28.7 3.0 !NA !NA !NA !NA
## 47 2019-01-21 Not Missing 16.4 11.5 28.7 3.0 !NA !NA !NA !NA
## 48 2019-01-22 Not Missing 14.5 13.6 22.6 15.0 !NA !NA !NA !NA
## 49 2019-01-22 Not Missing 14.5 13.6 22.6 15.0 !NA !NA !NA !NA
## 50 2019-01-23 Not Missing 13.8 8.0 19.4 27.9 !NA !NA !NA !NA
## 51 2019-01-23 Not Missing 13.8 8.0 19.4 27.9 !NA !NA !NA !NA
## 52 2019-01-24 Not Missing 14.8 9.7 21.0 0.0 !NA !NA !NA !NA
## 53 2019-01-24 Not Missing 14.8 9.7 21.0 0.0 !NA !NA !NA !NA
## 54 2019-01-25 Not Missing 13.4 11.0 22.0 6.1 !NA !NA !NA !NA
## 55 2019-01-25 Not Missing 13.4 11.0 22.0 6.1 !NA !NA !NA !NA
## 56 2019-01-26 Not Missing 12.2 6.0 19.0 0.0 !NA !NA !NA !NA
## 57 2019-01-26 Not Missing 12.2 6.0 19.0 0.0 !NA !NA !NA !NA
## 58 2019-02-01 Not Missing 16.3 11.0 22.0 0.0 !NA !NA !NA !NA
## 59 2019-02-01 Not Missing 16.3 11.0 22.0 0.0 !NA !NA !NA !NA
## 60 2019-02-02 Not Missing 12.9 8.2 22.5 0.0 !NA !NA !NA !NA
## 61 2019-02-02 Not Missing 12.9 8.2 22.5 0.0 !NA !NA !NA !NA
## 62 2019-02-06 Not Missing 19.5 12.0 25.0 0.8 !NA !NA !NA !NA
## 63 2019-02-06 Not Missing 19.5 12.0 25.0 0.8 !NA !NA !NA !NA
## 64 2019-02-07 Not Missing 16.5 13.6 25.1 0.0 !NA !NA !NA !NA
## 65 2019-02-07 Not Missing 16.5 13.6 25.1 0.0 !NA !NA !NA !NA
## 66 2019-02-08 Not Missing 14.9 7.0 20.4 5.1 !NA !NA !NA !NA
## 67 2019-02-08 Not Missing 14.9 7.0 20.4 5.1 !NA !NA !NA !NA
## 68 2019-02-09 Not Missing 14.8 8.4 20.6 0.0 !NA !NA !NA !NA
## 69 2019-02-09 Not Missing 14.8 8.4 20.6 0.0 !NA !NA !NA !NA
## 70 2019-02-14 Not Missing 17.7 13.9 25.3 1.0 !NA !NA !NA !NA
## 71 2019-02-14 Not Missing 17.7 13.9 25.3 1.0 !NA !NA !NA !NA
## 72 2019-02-15 Not Missing 17.6 11.8 23.0 9.9 !NA !NA !NA !NA
## 73 2019-02-15 Not Missing 17.6 11.8 23.0 9.9 !NA !NA !NA !NA
## 74 2019-02-16 Not Missing 16.0 13.3 22.0 0.0 !NA !NA !NA !NA
## 75 2019-02-16 Not Missing 16.0 13.3 22.0 0.0 !NA !NA !NA !NA
## 76 2019-02-19 Not Missing 18.7 12.0 25.8 2.0 !NA !NA !NA !NA
## 77 2019-02-19 Not Missing 18.7 12.0 25.8 2.0 !NA !NA !NA !NA
## 78 2019-02-20 Not Missing 18.3 14.5 24.5 2.0 !NA !NA !NA !NA
## 79 2019-02-20 Not Missing 18.3 14.5 24.5 2.0 !NA !NA !NA !NA
## 80 2019-02-21 Not Missing 21.6 13.2 28.0 0.8 !NA !NA !NA !NA
## 81 2019-02-21 Not Missing 21.6 13.2 28.0 0.8 !NA !NA !NA !NA
## 82 2019-02-22 Not Missing 19.1 15.4 28.1 0.0 !NA !NA !NA !NA
## 83 2019-02-22 Not Missing 19.1 15.4 28.1 0.0 !NA !NA !NA !NA
## 84 2019-02-26 Not Missing 15.3 10.3 25.4 1.0 !NA !NA !NA !NA
## 85 2019-02-26 Not Missing 15.3 10.3 25.4 1.0 !NA !NA !NA !NA
## 86 2019-02-27 Not Missing 14.3 9.5 21.0 0.0 !NA !NA !NA !NA
## 87 2019-03-02 Not Missing 15.2 12.6 24.1 0.5 !NA !NA !NA !NA
## 88 2019-03-02 Not Missing 15.2 12.6 24.1 0.5 !NA !NA !NA !NA
## 89 2019-03-03 Not Missing 17.1 12.8 22.4 9.9 !NA !NA !NA !NA
## 90 2019-03-03 Not Missing 17.1 12.8 22.4 9.9 !NA !NA !NA !NA
## 91 2019-03-04 Not Missing 19.1 11.2 24.0 0.3 !NA !NA !NA !NA
## 92 2019-03-04 Not Missing 19.1 11.2 24.0 0.3 !NA !NA !NA !NA
## 93 2019-03-05 Not Missing 18.4 12.0 23.9 0.0 !NA !NA !NA !NA
## 94 2019-03-05 Not Missing 18.4 12.0 23.9 0.0 !NA !NA !NA !NA
## 95 2019-03-09 Not Missing 20.6 12.6 26.5 0.0 !NA !NA !NA !NA
## 96 2019-03-09 Not Missing 20.6 12.6 26.5 0.0 !NA !NA !NA !NA
## 97 2019-03-15 Not Missing 20.2 10.4 26.0 0.5 !NA !NA !NA !NA
## 98 2019-03-15 Not Missing 20.2 10.4 26.0 0.5 !NA !NA !NA !NA
## 99 2019-03-16 Not Missing 21.6 12.5 26.0 0.0 !NA !NA !NA !NA
## 100 2019-03-16 Not Missing 21.6 12.5 26.0 0.0 !NA !NA !NA !NA
## 101 2019-03-31 Not Missing 28.8 21.0 39.2 0.3 !NA !NA !NA !NA
## 102 2019-04-01 Not Missing 27.7 17.6 34.3 0.0 !NA !NA !NA !NA
## 103 2019-04-06 Not Missing 33.6 22.4 39.3 0.0 !NA !NA !NA !NA
## 104 2019-04-07 Not Missing 30.8 24.0 37.8 6.1 !NA !NA !NA !NA
## 105 2019-04-12 Not Missing 30.6 23.2 38.7 0.5 !NA !NA !NA !NA
## 106 2019-04-13 Not Missing 31.4 21.6 38.0 0.3 !NA !NA !NA !NA
## 107 2019-04-14 Not Missing 32.2 21.8 38.0 0.0 !NA !NA !NA !NA
## 108 2019-04-16 Not Missing 26.0 20.5 40.0 1.0 !NA !NA !NA !NA
## 109 2019-04-17 Not Missing 24.3 19.0 30.7 1.0 !NA !NA !NA !NA
## 110 2019-04-18 Not Missing 25.8 17.1 31.0 0.5 !NA !NA !NA !NA
## 111 2019-04-19 Not Missing 27.5 18.2 32.0 0.0 !NA !NA !NA !NA
## 112 2019-05-03 Not Missing 32.6 24.0 41.0 0.0 !NA !NA !NA !NA
## 113 2019-05-04 Not Missing 32.1 22.4 39.5 4.1 !NA !NA !NA !NA
## 114 2019-05-11 Not Missing 33.9 24.4 41.2 0.0 !NA !NA !NA !NA
## 115 2019-05-14 Not Missing 30.8 20.8 40.2 3.0 !NA !NA !NA !NA
## 116 2019-05-15 Not Missing 29.4 23.0 35.0 0.0 !NA !NA !NA !NA
## 117 2019-05-16 Not Missing 30.6 23.4 36.4 2.0 !NA !NA !NA !NA
## 118 2019-05-17 Not Missing 29.6 21.6 37.0 0.0 !NA !NA !NA !NA
## 119 2019-05-18 Not Missing 28.7 19.5 37.4 10.9 !NA !NA !NA !NA
## 120 2019-05-19 Not Missing 31.9 23.2 37.0 0.0 !NA !NA !NA !NA
## 121 2019-05-24 Not Missing 29.8 23.8 40.4 6.1 !NA !NA !NA !NA
## 122 2019-05-25 Not Missing 32.5 24.5 37.0 0.0 !NA !NA !NA !NA
## 123 2019-06-16 Not Missing 31.5 28.4 43.4 0.0 !NA !NA !NA !NA
## 124 2019-06-17 Not Missing 30.3 25.6 36.3 0.5 !NA !NA !NA !NA
## 125 2019-06-18 Not Missing 28.5 20.6 35.0 10.9 !NA !NA !NA !NA
## 126 2019-06-19 Not Missing 32.2 24.2 37.0 0.0 !NA !NA !NA !NA
## 127 2019-06-22 Not Missing 35.6 27.2 40.0 0.0 !NA !NA !NA !NA
## 128 2019-07-04 Not Missing 30.9 28.6 39.4 0.0 !NA !NA !NA !NA
## 129 2019-07-05 Not Missing 28.8 26.9 38.6 1.0 !NA !NA !NA !NA
## 130 2019-07-05 Not Missing 28.8 26.9 38.6 1.0 !NA !NA !NA !NA
## 131 2019-07-06 Not Missing 33.4 26.5 37.0 24.9 !NA !NA !NA !NA
## 132 2019-07-06 Not Missing 33.4 26.5 37.0 24.9 !NA !NA !NA !NA
## 133 2019-07-07 Not Missing 32.1 27.6 37.0 0.0 !NA !NA !NA !NA
## 134 2019-07-07 Not Missing 32.1 27.6 37.0 0.0 !NA !NA !NA !NA
## 135 2019-07-08 Not Missing 33.7 28.6 36.0 0.0 !NA !NA !NA !NA
## 136 2019-07-08 Not Missing 33.7 28.6 36.0 0.0 !NA !NA !NA !NA
## 137 2019-07-17 Not Missing 27.0 24.0 33.4 22.1 !NA !NA !NA !NA
## 138 2019-07-17 Not Missing 27.0 24.0 33.4 22.1 !NA !NA !NA !NA
## 139 2019-07-18 Not Missing 27.0 23.5 31.7 11.9 !NA !NA !NA !NA
## 140 2019-07-18 Not Missing 27.0 23.5 31.7 11.9 !NA !NA !NA !NA
## 141 2019-07-19 Not Missing 31.9 25.0 36.0 4.1 !NA !NA !NA !NA
## 142 2019-07-19 Not Missing 31.9 25.0 36.0 4.1 !NA !NA !NA !NA
## 143 2019-07-20 Not Missing 30.7 26.0 36.2 0.0 !NA !NA !NA !NA
## 144 2019-07-20 Not Missing 30.7 26.0 36.2 0.0 !NA !NA !NA !NA
## 145 2019-07-21 Not Missing 30.6 25.6 36.7 8.9 !NA !NA !NA !NA
## 146 2019-07-21 Not Missing 30.6 25.6 36.7 8.9 !NA !NA !NA !NA
## 147 2019-07-22 Not Missing 31.3 24.6 36.5 50.0 !NA !NA !NA !NA
## 148 2019-07-22 Not Missing 31.3 24.6 36.5 50.0 !NA !NA !NA !NA
## 149 2019-07-23 Not Missing 34.2 28.2 37.0 4.1 !NA !NA !NA !NA
## 150 2019-07-23 Not Missing 34.2 28.2 37.0 4.1 !NA !NA !NA !NA
## 151 2019-07-24 Not Missing 33.3 27.8 38.0 0.0 !NA !NA !NA !NA
## 152 2019-07-24 Not Missing 33.3 27.8 38.0 0.0 !NA !NA !NA !NA
## 153 2019-07-25 Not Missing 28.7 25.0 37.8 21.1 !NA !NA !NA !NA
## 154 2019-07-25 Not Missing 28.7 25.0 37.8 21.1 !NA !NA !NA !NA
## 155 2019-07-26 Not Missing 28.6 26.2 32.0 0.5 !NA !NA !NA !NA
## 156 2019-07-26 Not Missing 28.6 26.2 32.0 0.5 !NA !NA !NA !NA
## 157 2019-07-27 Not Missing 28.7 25.4 32.5 10.9 !NA !NA !NA !NA
## 158 2019-07-27 Not Missing 28.7 25.4 32.5 10.9 !NA !NA !NA !NA
## 159 2019-07-28 Not Missing 30.1 25.8 34.0 3.0 !NA !NA !NA !NA
## 160 2019-07-28 Not Missing 30.1 25.8 34.0 3.0 !NA !NA !NA !NA
## 161 2019-07-29 Not Missing 31.9 27.9 36.0 6.1 !NA !NA !NA !NA
## 162 2019-07-29 Not Missing 31.9 27.9 36.0 6.1 !NA !NA !NA !NA
## 163 2019-07-30 Not Missing 31.5 27.4 36.1 0.0 !NA !NA !NA !NA
## 164 2019-07-30 Not Missing 31.5 27.4 36.1 0.0 !NA !NA !NA !NA
## 165 2019-07-31 Not Missing 31.9 27.8 35.1 0.0 !NA !NA !NA !NA
## 166 2019-07-31 Not Missing 31.9 27.8 35.1 0.0 !NA !NA !NA !NA
## 167 2019-08-06 Not Missing 27.2 24.0 37.4 11.9 !NA !NA !NA !NA
## 168 2019-08-06 Not Missing 27.2 24.0 37.4 11.9 !NA !NA !NA !NA
## 169 2019-08-07 Not Missing 30.9 25.6 34.0 22.1 !NA !NA !NA !NA
## 170 2019-08-07 Not Missing 30.9 25.6 34.0 22.1 !NA !NA !NA !NA
## 171 2019-08-08 Not Missing 33.5 27.5 38.0 0.0 !NA !NA !NA !NA
## 172 2019-08-08 Not Missing 33.5 27.5 38.0 0.0 !NA !NA !NA !NA
## 173 2019-08-10 Not Missing 31.1 27.2 34.0 0.0 !NA !NA !NA !NA
## 174 2019-08-10 Not Missing 31.1 27.2 34.0 0.0 !NA !NA !NA !NA
## 175 2019-08-11 Not Missing 32.9 27.6 36.0 0.0 !NA !NA !NA !NA
## 176 2019-08-11 Not Missing 32.9 27.6 36.0 0.0 !NA !NA !NA !NA
## 177 2019-08-12 Not Missing 31.1 28.0 36.2 0.3 !NA !NA !NA !NA
## 178 2019-08-12 Not Missing 31.1 28.0 36.2 0.3 !NA !NA !NA !NA
## 179 2019-08-13 Not Missing 30.4 27.8 35.0 2.0 !NA !NA !NA !NA
## 180 2019-08-13 Not Missing 30.4 27.8 35.0 2.0 !NA !NA !NA !NA
## 181 2019-08-14 Not Missing 30.5 25.6 35.5 10.9 !NA !NA !NA !NA
## 182 2019-08-14 Not Missing 30.5 25.6 35.5 10.9 !NA !NA !NA !NA
## 183 2019-08-15 Not Missing 30.6 24.8 34.6 10.9 !NA !NA !NA !NA
## 184 2019-08-15 Not Missing 30.6 24.8 34.6 10.9 !NA !NA !NA !NA
## 185 2019-08-16 Not Missing 29.9 26.6 34.8 0.0 !NA !NA !NA !NA
## 186 2019-08-16 Not Missing 29.9 26.6 34.8 0.0 !NA !NA !NA !NA
## 187 2019-08-17 Not Missing 26.9 25.2 32.5 7.1 !NA !NA !NA !NA
## 188 2019-08-17 Not Missing 26.9 25.2 32.5 7.1 !NA !NA !NA !NA
## 189 2019-08-18 Not Missing 27.3 24.8 29.6 46.0 !NA !NA !NA !NA
## 190 2019-08-18 Not Missing 27.3 24.8 29.6 46.0 !NA !NA !NA !NA
## 191 2019-08-19 Not Missing 30.7 23.6 34.0 2.0 !NA !NA !NA !NA
## 192 2019-08-19 Not Missing 30.7 23.6 34.0 2.0 !NA !NA !NA !NA
## 193 2019-08-20 Not Missing 31.4 25.6 35.0 0.0 !NA !NA !NA !NA
## 194 2019-08-20 Not Missing 31.4 25.6 35.0 0.0 !NA !NA !NA !NA
## 195 2019-08-21 Not Missing 32.0 24.8 36.0 0.0 !NA !NA !NA !NA
## 196 2019-08-21 Not Missing 32.0 24.8 36.0 0.0 !NA !NA !NA !NA
## 197 2019-08-23 Not Missing 31.8 24.6 36.0 0.0 !NA !NA !NA !NA
## 198 2019-08-23 Not Missing 31.8 24.6 36.0 0.0 !NA !NA !NA !NA
## 199 2019-08-25 Not Missing 29.3 26.8 33.8 0.3 !NA !NA !NA !NA
## 200 2019-08-25 Not Missing 29.3 26.8 33.8 0.3 !NA !NA !NA !NA
## 201 2019-08-26 Not Missing 31.9 25.7 35.0 3.0 !NA !NA !NA !NA
## 202 2019-08-26 Not Missing 31.9 25.7 35.0 3.0 !NA !NA !NA !NA
## 203 2019-08-27 Not Missing 32.9 27.4 36.4 0.0 !NA !NA !NA !NA
## 204 2019-08-27 Not Missing 32.9 27.4 36.4 0.0 !NA !NA !NA !NA
## 205 2019-09-15 Not Missing 30.7 27.3 35.2 2.0 !NA !NA !NA !NA
## 206 2019-09-15 Not Missing 30.7 27.3 35.2 2.0 !NA !NA !NA !NA
## 207 2019-09-16 Not Missing 32.3 27.1 36.0 0.0 !NA !NA !NA !NA
## 208 2019-09-16 Not Missing 32.3 27.1 36.0 0.0 !NA !NA !NA !NA
## 209 2019-09-17 Not Missing 32.3 27.5 36.4 5.1 !NA !NA !NA !NA
## 210 2019-09-17 Not Missing 32.3 27.5 36.4 5.1 !NA !NA !NA !NA
## 211 2019-09-18 Not Missing 30.9 26.2 36.8 0.3 !NA !NA !NA !NA
## 212 2019-09-18 Not Missing 30.9 26.2 36.8 0.3 !NA !NA !NA !NA
## 213 2019-09-19 Not Missing 29.1 25.1 35.1 0.0 !NA !NA !NA !NA
## 214 2019-09-19 Not Missing 29.1 25.1 35.1 0.0 !NA !NA !NA !NA
## 215 2019-09-20 Not Missing 30.8 24.9 35.0 0.3 !NA !NA !NA !NA
## 216 2019-09-20 Not Missing 30.8 24.9 35.0 0.3 !NA !NA !NA !NA
## 217 2019-09-21 Not Missing 28.0 24.0 35.3 0.0 !NA !NA !NA !NA
## 218 2019-09-21 Not Missing 28.0 24.0 35.3 0.0 !NA !NA !NA !NA
## 219 2019-09-22 Not Missing 27.2 23.7 35.2 33.0 !NA !NA !NA !NA
## 220 2019-09-22 Not Missing 27.2 23.7 35.2 33.0 !NA !NA !NA !NA
## 221 2019-09-23 Not Missing 29.1 24.0 32.0 0.0 !NA !NA !NA !NA
## 222 2019-09-23 Not Missing 29.1 24.0 32.0 0.0 !NA !NA !NA !NA
## 223 2019-09-24 Not Missing 29.4 23.9 34.0 0.0 !NA !NA !NA !NA
## 224 2019-09-24 Not Missing 29.4 23.9 34.0 0.0 !NA !NA !NA !NA
## 225 2019-09-29 Not Missing 27.7 24.2 33.0 0.3 !NA !NA !NA !NA
## 226 2019-09-29 Not Missing 27.7 24.2 33.0 0.3 !NA !NA !NA !NA
## 227 2019-09-30 Not Missing 27.4 23.8 31.2 0.0 !NA !NA !NA !NA
## 228 2019-09-30 Not Missing 27.4 23.8 31.2 0.0 !NA !NA !NA !NA
## 229 2019-10-01 Not Missing 28.7 23.8 32.4 0.0 !NA !NA !NA !NA
## 230 2019-10-01 Not Missing 28.7 23.8 32.4 0.0 !NA !NA !NA !NA
## 231 2019-10-02 Not Missing 29.0 23.4 33.0 2.0 !NA !NA !NA !NA
## 232 2019-10-03 Not Missing 28.8 21.4 34.4 0.0 !NA !NA !NA !NA
## 233 2019-10-04 Not Missing 27.2 19.8 34.8 47.0 !NA !NA !NA !NA
## 234 2019-10-04 Not Missing 27.2 19.8 34.8 47.0 !NA !NA !NA !NA
## 235 2019-10-05 Not Missing 27.3 21.0 32.0 0.0 !NA !NA !NA !NA
## 236 2019-10-05 Not Missing 27.3 21.0 32.0 0.0 !NA !NA !NA !NA
## 237 2019-10-19 Not Missing 27.3 19.8 34.7 0.0 !NA !NA !NA !NA
## 238 2019-10-19 Not Missing 27.3 19.8 34.7 0.0 !NA !NA !NA !NA
## 239 2019-11-03 Not Missing 23.4 18.7 29.5 0.3 !NA !NA !NA !NA
## 240 2019-11-03 Not Missing 23.4 18.7 29.5 0.3 !NA !NA !NA !NA
## 241 2019-11-04 Not Missing 24.1 18.9 29.4 0.0 !NA !NA !NA !NA
## 242 2019-11-04 Not Missing 24.1 18.9 29.4 0.0 !NA !NA !NA !NA
## 243 2019-11-26 Not Missing 21.6 16.2 27.2 0.0 !NA !NA !NA !NA
## 244 2019-11-26 Not Missing 21.6 16.2 27.2 0.0 !NA !NA !NA !NA
## 245 2019-11-28 Not Missing 18.3 16.2 27.4 0.0 !NA !NA !NA !NA
## 246 2019-11-28 Not Missing 18.3 16.2 27.4 0.0 !NA !NA !NA !NA
## 247 2019-11-29 Not Missing 18.1 12.5 24.6 0.0 !NA !NA !NA !NA
## 248 2019-11-29 Not Missing 18.1 12.5 24.6 0.0 !NA !NA !NA !NA
## 249 2019-12-13 Not Missing 15.2 12.8 21.5 34.0 !NA !NA !NA !NA
## 250 2019-12-13 Not Missing 15.2 12.8 21.5 34.0 !NA !NA !NA !NA
## 251 2019-12-14 Not Missing 14.7 9.7 18.7 0.5 !NA !NA !NA !NA
## 252 2019-12-14 Not Missing 14.7 9.7 18.7 0.5 !NA !NA !NA !NA
## 253 2019-12-15 Not Missing 14.2 9.8 19.2 0.0 !NA !NA !NA !NA
## 254 2019-12-15 Not Missing 14.2 9.8 19.2 0.0 !NA !NA !NA !NA
## 255 2020-01-07 Not Missing 15.0 11.6 19.1 0.0 !NA !NA !NA !NA
## 256 2020-01-08 Not Missing 13.2 11.9 19.0 6.1 !NA !NA !NA !NA
## 257 2020-01-08 Not Missing 13.2 11.9 19.0 6.1 !NA !NA !NA !NA
## 258 2020-01-09 Not Missing 10.4 8.4 14.6 10.9 !NA !NA !NA !NA
## 259 2020-01-09 Not Missing 10.4 8.4 14.6 10.9 !NA !NA !NA !NA
## 260 2020-01-10 Not Missing 10.7 5.3 17.4 0.0 !NA !NA !NA !NA
## 261 2020-01-10 Not Missing 10.7 5.3 17.4 0.0 !NA !NA !NA !NA
## 262 2020-01-16 Not Missing 13.2 9.5 19.9 0.0 !NA !NA !NA !NA
## 263 2020-01-16 Not Missing 13.2 9.5 19.9 0.0 !NA !NA !NA !NA
## 264 2020-01-17 Not Missing 13.2 11.2 17.4 6.1 !NA !NA !NA !NA
## 265 2020-01-17 Not Missing 13.2 11.2 17.4 6.1 !NA !NA !NA !NA
## 266 2020-01-18 Not Missing 12.3 8.4 17.7 0.0 !NA !NA !NA !NA
## 267 2020-01-18 Not Missing 12.3 8.4 17.7 0.0 !NA !NA !NA !NA
## prcp_NA StationId PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3
## 1 !NA DL019 48.03 89.10 4.09 39.86 24.30 18.91 0.68 12.71 9.14
## 2 !NA DL019 23.98 38.46 3.64 34.88 21.51 26.11 0.52 11.41 6.42
## 3 !NA DL019 34.77 60.62 11.30 53.24 37.53 38.76 0.75 9.87 11.59
## 4 !NA DL019 42.65 113.91 5.90 50.46 31.40 21.05 0.91 12.43 10.90
## 5 !NA DL019 44.09 138.82 2.30 34.02 19.90 27.99 0.56 7.27 13.37
## 6 !NA DL019 48.80 110.93 8.11 29.32 22.19 35.32 0.53 10.14 15.38
## 7 !NA DL019 59.80 114.52 1.66 32.95 18.81 28.86 0.60 10.64 19.23
## 8 !NA DL019 49.42 101.33 4.94 38.88 24.63 22.11 0.61 11.04 18.74
## 9 !NA DL019 40.46 94.12 9.68 32.98 25.43 23.14 0.50 10.86 10.50
## 10 !NA DL019 33.39 85.74 5.23 32.80 21.71 23.95 0.51 10.98 9.65
## 11 !NA DL019 35.95 90.71 17.23 46.91 39.00 30.17 0.70 11.83 8.67
## 12 !NA DL019 41.92 90.96 20.37 74.41 56.11 49.97 0.64 10.83 12.26
## 13 !NA DL019 40.77 108.08 21.07 68.90 53.83 56.35 0.52 10.22 17.89
## 14 !NA DL019 36.03 99.81 13.89 64.51 45.63 49.08 0.48 13.14 11.81
## 15 !NA DL019 39.06 88.27 6.45 73.05 44.00 30.64 0.72 13.22 9.06
## 16 !NA DL019 47.97 97.82 17.46 64.82 47.08 43.67 0.63 17.02 9.26
## 17 !NA DL019 22.64 59.55 3.46 33.60 20.66 21.08 0.87 17.24 10.32
## 18 !NA DL019 27.32 79.45 6.11 31.43 21.70 21.78 0.69 12.70 10.95
## 19 !NA DL019 40.11 101.69 7.81 29.34 21.88 33.71 0.78 12.22 11.37
## 20 !NA DL019 27.56 76.45 8.17 27.78 21.44 44.72 0.70 11.84 8.60
## 21 !NA DL019 18.45 58.76 4.48 30.09 19.66 43.76 0.70 11.46 8.75
## 22 !NA DL019 32.40 88.77 4.75 26.28 17.83 42.84 0.68 12.00 10.14
## 23 !NA DL019 41.42 130.54 5.49 24.50 17.49 41.32 0.66 11.67 10.44
## 24 !NA DL019 37.15 129.03 4.97 23.76 16.65 39.42 0.63 11.74 11.72
## 25 !NA DL019 70.14 177.67 7.05 36.56 25.19 40.30 0.67 12.47 10.18
## 26 !NA DL019 58.59 152.86 9.93 31.35 24.77 41.15 0.70 11.59 9.32
## 27 !NA DL019 47.34 128.71 14.22 34.28 29.81 41.15 0.69 11.42 16.40
## 28 !NA DL019 36.09 96.95 13.76 32.23 28.36 44.02 0.70 9.82 10.43
## 29 !NA DL019 32.29 98.68 9.28 32.25 24.72 44.54 0.53 11.67 18.31
## 30 !NA DL019 154.65 249.46 16.98 61.42 46.51 41.21 1.80 4.86 15.92
## 31 !NA DL019 93.58 145.83 3.81 45.51 27.30 40.69 1.64 5.53 30.92
## 32 !NA DL019 134.31 177.53 54.66 67.76 80.62 47.55 2.80 2.89 6.16
## 33 !NA DL001 150.41 214.16 6.35 96.17 56.30 36.02 0.99 12.74 41.87
## 34 !NA DL001 106.11 154.36 7.36 77.52 47.21 30.45 0.70 12.46 27.93
## 35 !NA DL019 97.99 137.40 24.49 57.58 50.50 51.82 2.28 2.85 10.66
## 36 !NA DL001 146.23 211.34 2.01 60.29 33.65 28.52 0.71 11.64 46.12
## 37 !NA DL019 134.32 171.98 5.22 57.42 34.80 53.13 2.23 3.64 13.95
## 38 !NA DL019 209.72 255.19 48.14 45.82 63.38 6.79 1.62 3.95 22.71
## 39 !NA DL001 210.33 290.50 7.68 73.71 45.47 33.01 1.21 21.13 20.55
## 40 !NA DL019 86.50 118.83 10.07 35.82 27.17 7.81 1.12 4.75 9.87
## 41 !NA DL001 78.00 115.42 5.47 61.26 37.06 24.49 0.57 18.57 18.86
## 42 !NA DL001 98.81 158.25 9.09 68.82 44.01 26.74 0.59 18.66 18.43
## 43 !NA DL019 101.20 133.30 8.75 49.72 31.25 32.18 1.14 3.50 18.23
## 44 !NA DL001 150.23 196.81 2.24 91.81 50.66 44.10 0.87 22.62 23.96
## 45 !NA DL019 168.74 187.08 28.77 112.54 83.32 54.21 0.74 5.32 21.11
## 46 !NA DL001 191.15 321.77 6.67 95.19 56.02 44.42 1.19 26.78 27.61
## 47 !NA DL019 147.88 207.89 67.89 73.26 94.31 36.03 0.79 7.33 6.20
## 48 !NA DL001 39.77 46.11 2.14 52.05 29.42 27.64 0.45 21.19 30.98
## 49 !NA DL019 55.65 85.27 10.90 62.96 42.36 34.35 0.90 6.16 7.20
## 50 !NA DL001 104.21 162.48 39.24 56.00 61.83 26.57 0.98 25.83 20.15
## 51 !NA DL019 146.05 195.46 59.46 52.54 76.42 37.90 1.60 8.21 6.30
## 52 !NA DL001 102.38 160.79 8.33 50.07 33.41 28.12 0.71 21.38 25.92
## 53 !NA DL019 119.33 176.41 33.24 54.55 56.11 32.59 1.19 8.01 14.91
## 54 !NA DL001 76.20 113.33 2.76 53.11 30.51 27.75 0.50 31.74 28.56
## 55 !NA DL019 72.98 123.75 7.71 57.39 36.81 29.03 0.93 7.45 13.34
## 56 !NA DL019 89.44 139.14 4.02 56.49 33.31 26.86 0.83 7.12 13.92
## 57 !NA DL001 88.94 135.61 2.73 45.81 26.61 26.01 0.59 23.34 32.51
## 58 !NA DL001 191.46 252.49 2.51 85.07 47.28 49.65 1.04 27.00 28.53
## 59 !NA DL019 166.52 217.06 15.66 69.95 49.96 37.94 1.05 6.26 4.75
## 60 !NA DL001 172.49 244.02 2.52 61.92 34.99 43.08 0.79 21.83 23.07
## 61 !NA DL019 185.62 236.47 4.23 49.53 29.80 39.63 1.15 6.56 4.53
## 62 !NA DL001 213.33 311.04 17.63 85.95 60.09 45.73 1.63 27.71 24.50
## 63 !NA DL019 188.02 283.68 56.11 58.54 76.89 40.28 1.09 8.03 7.73
## 64 !NA DL001 70.52 101.02 1.82 54.99 30.69 30.62 0.59 23.19 28.01
## 65 !NA DL019 65.24 132.07 4.34 37.01 23.22 30.98 0.81 6.75 11.12
## 66 !NA DL019 63.20 155.56 18.80 37.41 35.22 22.11 0.92 7.22 13.03
## 67 !NA DL001 65.52 126.50 25.78 44.60 44.75 21.79 0.62 26.44 23.73
## 68 !NA DL001 77.25 140.38 4.53 43.58 26.88 23.22 0.53 26.43 30.43
## 69 !NA DL019 68.77 159.78 6.96 35.30 24.45 21.50 0.87 8.52 13.42
## 70 !NA DL019 159.43 244.48 38.12 60.53 63.27 31.33 1.19 8.34 5.36
## 71 !NA DL001 174.76 246.32 13.33 81.38 54.15 36.65 1.39 28.40 31.81
## 72 !NA DL001 120.54 157.00 2.75 55.38 31.69 28.87 0.87 31.67 29.82
## 73 !NA DL019 148.94 231.57 10.78 50.74 35.77 31.36 0.99 8.43 6.65
## 74 !NA DL019 117.91 193.99 3.10 35.12 21.21 27.14 0.98 7.52 6.97
## 75 !NA DL001 106.16 155.68 2.63 46.76 27.03 25.11 0.66 28.29 24.06
## 76 !NA DL001 125.07 192.40 7.38 64.35 40.32 30.37 0.91 24.55 43.45
## 77 !NA DL019 126.11 237.02 30.65 47.81 50.39 29.05 1.38 7.56 9.69
## 78 !NA DL019 108.52 215.71 41.71 48.71 59.91 28.32 1.01 9.43 14.43
## 79 !NA DL001 87.22 123.58 3.12 56.47 32.55 27.17 0.76 21.91 19.45
## 80 !NA DL001 68.33 142.60 5.38 56.59 34.46 22.83 0.73 27.83 27.84
## 81 !NA DL019 63.25 171.69 37.30 43.02 53.29 25.03 1.14 8.14 7.82
## 82 !NA DL019 74.88 159.88 16.65 30.72 29.91 25.59 0.85 6.65 8.39
## 83 !NA DL001 65.97 118.76 5.54 50.62 31.44 25.23 0.53 22.70 29.35
## 84 !NA DL001 52.27 82.18 2.21 43.15 24.71 16.79 0.45 26.89 35.16
## 85 !NA DL019 56.93 133.00 16.89 57.14 44.14 21.03 1.00 8.73 8.54
## 86 !NA DL001 42.78 68.55 3.01 36.94 22.10 14.80 0.40 22.35 21.55
## 87 !NA DL019 91.84 176.40 43.90 63.85 69.74 18.87 1.11 9.27 4.59
## 88 !NA DL001 138.18 223.32 20.19 71.38 54.39 24.18 0.99 29.02 9.77
## 89 !NA DL001 61.74 81.40 2.16 40.92 23.53 19.72 0.54 25.56 38.38
## 90 !NA DL019 61.68 115.89 16.01 44.02 36.45 19.22 0.94 6.95 7.99
## 91 !NA DL001 76.20 114.85 10.49 47.07 33.59 20.26 0.65 26.95 28.00
## 92 !NA DL019 105.91 188.61 51.91 31.14 58.84 21.19 1.05 7.07 32.26
## 93 !NA DL001 71.82 135.15 5.63 43.70 27.84 19.20 0.51 26.19 33.92
## 94 !NA DL019 63.73 132.50 5.30 34.74 22.69 17.55 0.89 6.88 44.88
## 95 !NA DL001 65.60 149.44 11.06 50.46 35.84 20.33 0.51 32.84 38.21
## 96 !NA DL019 65.38 163.77 9.81 42.77 30.70 15.86 0.86 9.43 9.38
## 97 !NA DL019 72.04 159.52 2.92 36.21 21.12 20.37 0.83 9.15 23.64
## 98 !NA DL001 59.90 119.92 21.80 56.28 47.72 21.08 0.51 25.61 36.05
## 99 !NA DL001 60.85 114.67 1.85 52.28 29.31 20.44 0.44 27.45 50.86
## 100 !NA DL019 63.29 149.71 5.00 41.93 26.36 18.62 0.73 8.34 18.36
## 101 !NA DL019 48.68 196.27 4.82 41.01 25.74 20.79 0.63 10.52 22.62
## 102 !NA DL019 52.56 184.04 4.54 42.27 25.74 19.63 0.85 10.67 25.11
## 103 !NA DL019 79.36 293.61 24.63 54.42 49.03 24.12 1.05 10.76 20.50
## 104 !NA DL019 143.06 348.88 36.67 52.61 57.76 29.21 1.12 10.10 23.44
## 105 !NA DL019 48.56 230.41 8.45 45.13 30.50 22.73 0.90 9.62 20.94
## 106 !NA DL019 40.45 169.65 5.22 40.98 26.04 23.85 0.72 10.07 19.98
## 107 !NA DL019 42.68 183.83 7.75 48.28 31.99 21.72 0.84 11.92 18.62
## 108 !NA DL019 66.44 168.85 9.90 43.25 31.07 22.89 0.84 7.58 18.10
## 109 !NA DL019 27.12 76.50 4.31 36.48 22.90 20.31 0.67 7.68 20.07
## 110 !NA DL019 38.28 111.35 5.60 41.57 26.29 20.85 0.77 9.11 23.55
## 111 !NA DL019 63.06 144.54 16.82 50.42 40.53 20.87 0.88 11.75 21.30
## 112 !NA DL019 88.20 195.73 10.97 54.53 36.33 30.18 0.81 10.69 21.82
## 113 !NA DL019 103.65 234.47 42.04 68.22 67.57 35.99 0.74 12.78 17.04
## 114 !NA DL019 101.77 293.29 10.79 42.76 31.16 20.02 0.87 11.65 24.60
## 115 !NA DL019 57.80 131.08 9.73 43.60 28.85 17.94 0.83 8.31 24.87
## 116 !NA DL019 85.39 208.42 53.08 70.94 80.64 27.12 1.15 8.95 18.56
## 117 !NA DL019 76.67 220.00 39.28 64.30 62.63 30.79 0.95 8.41 20.23
## 118 !NA DL019 78.33 344.09 37.56 56.10 55.83 23.64 0.87 7.59 20.45
## 119 !NA DL019 43.61 123.89 7.87 29.20 18.74 28.60 0.74 7.04 24.10
## 120 !NA DL019 83.21 209.58 8.41 38.00 26.96 25.14 0.83 8.14 28.73
## 121 !NA DL019 44.15 127.02 18.11 31.04 30.90 40.73 0.79 9.83 19.49
## 122 !NA DL019 35.08 121.60 5.12 32.40 21.44 26.51 0.85 12.37 26.73
## 123 !NA DL019 68.79 172.15 4.91 25.83 17.74 46.24 0.70 7.92 24.92
## 124 !NA DL019 64.84 153.50 5.63 30.68 20.88 42.71 0.79 7.95 22.25
## 125 !NA DL019 34.55 94.30 16.34 44.70 35.00 28.79 0.86 6.04 19.00
## 126 !NA DL019 93.17 241.93 20.11 51.61 43.85 27.04 0.61 8.18 26.65
## 127 !NA DL019 63.18 186.10 9.69 73.96 46.90 21.34 0.77 8.09 32.03
## 128 !NA DL019 45.21 118.69 11.17 35.18 27.56 14.99 0.87 7.36 17.54
## 129 !NA DL001 47.45 75.03 2.85 49.05 28.41 57.51 0.66 23.86 24.61
## 130 !NA DL019 64.39 135.59 21.94 39.47 38.85 16.95 0.81 7.58 15.18
## 131 !NA DL001 42.46 74.21 3.21 51.40 29.99 54.50 0.61 21.37 39.84
## 132 !NA DL019 20.55 103.28 8.98 35.52 26.22 17.80 0.81 6.94 18.97
## 133 !NA DL001 43.80 70.18 1.11 32.24 18.05 55.02 0.53 23.36 45.17
## 134 !NA DL019 34.34 107.30 6.22 32.23 22.20 15.30 0.64 7.03 21.85
## 135 !NA DL001 54.62 140.25 2.16 42.45 24.34 36.26 0.64 18.98 44.54
## 136 !NA DL019 45.93 169.15 7.54 40.61 27.74 15.03 0.80 7.73 20.40
## 137 !NA DL001 24.15 36.65 1.12 23.82 13.61 17.10 0.88 10.79 37.05
## 138 !NA DL019 38.59 92.92 6.26 39.53 25.99 20.62 0.91 4.35 16.46
## 139 !NA DL001 32.23 57.92 1.42 34.34 19.42 19.36 0.76 11.85 42.35
## 140 !NA DL019 33.97 80.83 15.40 49.26 36.63 21.24 1.06 4.28 16.92
## 141 !NA DL001 71.08 138.35 2.90 42.43 24.95 23.11 1.04 20.86 74.89
## 142 !NA DL019 63.72 146.20 54.57 48.09 67.12 21.17 1.12 3.95 20.31
## 143 !NA DL001 43.86 84.22 1.66 47.05 26.47 28.09 0.67 11.64 22.63
## 144 !NA DL019 58.94 131.49 11.36 57.36 39.70 22.36 1.00 3.49 14.91
## 145 !NA DL001 55.20 85.48 2.14 43.25 24.72 17.93 2.15 11.15 19.66
## 146 !NA DL019 58.51 111.60 22.58 46.04 42.73 27.93 0.85 3.76 23.57
## 147 !NA DL019 69.38 122.27 52.13 43.53 65.52 34.12 1.12 3.71 17.17
## 148 !NA DL001 53.82 80.07 1.96 46.47 26.17 29.33 1.49 10.42 24.31
## 149 !NA DL019 89.95 160.75 46.38 57.51 68.24 31.60 1.44 3.93 28.22
## 150 !NA DL001 49.88 77.04 1.49 67.93 37.33 34.69 2.15 9.93 21.56
## 151 !NA DL001 65.75 152.15 2.70 68.72 38.75 44.05 0.98 11.27 30.19
## 152 !NA DL019 79.58 145.19 33.93 54.49 56.51 28.33 0.87 3.94 17.12
## 153 !NA DL001 19.18 44.81 1.72 55.82 31.07 63.55 0.73 10.20 17.24
## 154 !NA DL019 30.30 73.56 11.44 40.74 30.17 22.51 0.89 3.91 15.14
## 155 !NA DL001 34.31 43.86 1.33 41.42 23.12 32.85 0.61 12.76 26.60
## 156 !NA DL019 40.11 85.79 13.89 47.02 36.33 26.32 0.88 3.28 13.14
## 157 !NA DL019 33.53 79.10 13.98 40.59 32.16 25.13 0.88 3.75 14.18
## 158 !NA DL001 24.82 20.09 1.21 26.91 15.30 21.83 0.62 10.06 24.13
## 159 !NA DL001 21.00 23.73 1.25 27.88 15.86 22.64 0.57 11.31 21.73
## 160 !NA DL019 21.00 69.27 6.18 30.82 21.16 20.55 0.64 3.80 11.30
## 161 !NA DL019 19.85 73.21 5.14 32.82 21.45 17.52 0.67 3.64 12.95
## 162 !NA DL001 20.23 30.11 1.21 36.57 20.70 29.44 0.59 12.79 24.47
## 163 !NA DL001 24.44 36.65 1.31 45.32 25.17 34.19 0.69 11.29 22.74
## 164 !NA DL019 27.57 89.08 6.04 37.27 24.49 18.86 0.77 3.60 12.81
## 165 !NA DL019 26.96 88.10 4.67 36.28 23.01 19.32 0.91 4.02 13.46
## 166 !NA DL001 26.58 43.81 1.13 39.98 22.19 30.86 0.56 11.51 27.06
## 167 !NA DL001 50.96 77.29 4.98 70.76 41.71 46.58 0.94 10.66 6.76
## 168 !NA DL019 26.39 73.99 23.17 49.17 44.14 30.47 0.93 4.06 9.84
## 169 !NA DL001 24.22 29.95 3.06 48.51 28.30 32.54 0.55 12.30 18.95
## 170 !NA DL019 30.69 92.73 25.54 39.67 40.61 21.75 1.10 5.11 15.49
## 171 !NA DL019 18.32 86.28 11.46 42.90 32.05 16.76 0.94 6.31 19.06
## 172 !NA DL001 22.54 34.75 1.51 32.67 18.61 24.72 0.59 12.84 29.70
## 173 !NA DL001 18.38 27.38 1.25 24.00 13.77 20.41 1.11 12.44 34.80
## 174 !NA DL019 14.42 68.50 4.87 30.48 20.15 16.87 0.68 3.54 11.09
## 175 !NA DL001 31.00 54.17 1.33 39.64 23.15 29.22 2.07 13.10 35.91
## 176 !NA DL019 26.24 80.58 4.96 33.22 21.65 16.69 0.74 4.21 13.16
## 177 !NA DL019 17.52 72.61 4.53 26.55 17.57 16.81 0.79 3.68 12.75
## 178 !NA DL001 23.12 51.64 1.75 47.33 26.60 29.90 1.17 12.01 22.06
## 179 !NA DL019 20.46 83.27 14.22 28.45 26.71 16.21 0.85 3.57 11.34
## 180 !NA DL001 22.85 42.51 3.28 47.21 27.77 30.39 1.17 10.98 16.34
## 181 !NA DL019 42.35 93.96 9.75 32.56 25.20 18.26 0.83 3.21 15.71
## 182 !NA DL001 24.55 30.97 1.67 38.25 21.71 27.03 0.98 11.73 30.61
## 183 !NA DL019 31.03 84.30 11.87 24.54 22.73 17.06 0.74 3.19 11.78
## 184 !NA DL001 20.74 34.86 1.94 35.06 20.23 27.33 0.74 10.46 23.76
## 185 !NA DL001 12.55 18.06 1.01 18.55 10.69 17.01 0.61 10.65 30.45
## 186 !NA DL019 16.71 68.16 6.19 26.68 19.19 14.44 0.74 3.55 10.76
## 187 !NA DL019 14.86 49.05 8.84 20.54 18.09 13.03 0.68 3.71 9.75
## 188 !NA DL001 10.06 10.75 1.35 17.53 10.43 14.90 0.66 10.06 16.16
## 189 !NA DL019 19.09 53.43 8.26 18.77 16.69 13.16 0.74 3.91 11.67
## 190 !NA DL001 12.17 11.54 1.31 17.46 10.37 14.89 0.76 10.43 13.92
## 191 !NA DL019 39.69 101.10 24.08 24.40 32.49 13.19 0.99 3.17 14.82
## 192 !NA DL001 35.17 52.60 3.79 26.92 17.40 21.70 0.81 10.50 21.74
## 193 !NA DL019 41.58 103.01 14.88 33.98 29.26 17.01 0.87 2.73 12.85
## 194 !NA DL001 34.21 65.97 4.93 34.44 22.34 28.25 0.77 11.47 23.82
## 195 !NA DL019 47.48 133.83 8.92 42.71 29.99 23.84 0.76 2.35 15.48
## 196 !NA DL001 45.26 106.77 3.22 37.21 22.41 26.20 0.79 10.62 15.46
## 197 !NA DL019 50.69 124.96 49.95 32.06 57.44 23.93 0.94 3.07 12.16
## 198 !NA DL001 54.67 126.70 23.44 65.71 54.08 33.30 1.15 11.17 18.10
## 199 !NA DL001 52.11 79.86 1.87 48.46 27.36 26.43 0.92 12.09 19.87
## 200 !NA DL019 31.08 79.48 17.10 26.83 28.21 21.81 0.85 4.12 9.60
## 201 !NA DL019 23.94 72.95 23.47 35.11 37.81 21.00 0.87 3.38 11.77
## 202 !NA DL001 15.35 30.65 1.39 30.86 17.55 19.04 0.62 10.87 19.69
## 203 !NA DL001 29.75 52.36 2.50 26.98 16.16 18.98 0.71 11.11 18.22
## 204 !NA DL019 35.19 98.52 23.78 39.56 40.32 20.89 0.95 3.67 15.01
## 205 !NA DL019 39.41 91.71 10.58 31.13 25.11 14.49 0.88 3.95 17.75
## 206 !NA DL001 28.75 47.73 1.30 25.54 14.64 18.41 0.61 9.42 19.78
## 207 !NA DL019 53.96 121.30 23.45 29.47 34.59 15.10 1.02 4.77 21.23
## 208 !NA DL001 49.98 100.67 6.72 36.49 24.88 27.82 0.83 10.02 36.94
## 209 !NA DL019 57.39 129.73 11.07 35.22 27.72 13.47 1.00 4.35 19.99
## 210 !NA DL001 52.58 104.31 1.97 34.85 20.14 26.68 0.74 10.25 44.14
## 211 !NA DL001 32.61 71.81 1.23 30.62 17.29 21.07 0.56 13.55 41.14
## 212 !NA DL019 37.29 107.50 4.22 36.34 22.70 11.27 0.73 4.92 16.54
## 213 !NA DL019 20.60 76.90 3.95 27.00 17.56 9.84 0.75 4.30 15.21
## 214 !NA DL001 17.17 40.71 1.03 20.78 11.90 16.81 0.50 13.76 30.34
## 215 !NA DL019 30.98 88.04 8.97 27.10 21.71 9.32 0.78 4.61 14.38
## 216 !NA DL001 38.12 78.35 1.71 30.01 17.35 21.57 0.62 16.35 39.36
## 217 !NA DL001 60.40 103.46 5.39 37.89 24.58 27.24 0.87 14.04 33.37
## 218 !NA DL019 33.33 90.01 11.16 21.81 20.71 8.78 0.80 5.08 16.26
## 219 !NA DL019 22.10 73.99 14.84 22.41 24.01 7.35 0.73 4.64 14.03
## 220 !NA DL001 31.81 52.50 4.99 47.46 29.53 30.15 0.56 10.32 20.23
## 221 !NA DL001 26.15 46.15 1.26 36.90 20.66 24.37 0.69 12.32 31.13
## 222 !NA DL019 35.91 102.47 8.79 25.26 20.57 8.01 0.80 3.98 16.60
## 223 !NA DL001 28.42 58.85 4.64 39.32 24.71 23.01 0.84 9.78 21.28
## 224 !NA DL019 36.50 106.48 10.67 25.92 22.47 7.92 0.91 4.65 16.18
## 225 !NA DL001 16.20 30.37 1.33 21.24 12.38 18.27 0.65 14.19 19.78
## 226 !NA DL019 19.81 48.44 9.93 40.81 29.72 14.96 0.55 2.84 18.30
## 227 !NA DL001 30.50 60.81 1.85 24.73 14.66 17.77 0.79 13.56 34.69
## 228 !NA DL019 33.04 81.04 15.44 48.84 37.98 16.13 0.77 2.48 18.95
## 229 !NA DL001 40.60 83.08 3.64 34.48 21.30 19.68 0.92 13.75 28.40
## 230 !NA DL019 32.58 78.65 16.01 49.26 39.06 15.39 0.75 6.38 20.76
## 231 !NA DL001 37.80 66.03 6.37 33.32 22.92 24.17 0.97 16.69 26.08
## 232 !NA DL001 47.76 94.13 3.27 34.13 20.82 24.91 0.79 17.05 26.00
## 233 !NA DL001 39.07 75.68 3.38 36.32 23.30 22.42 0.76 13.89 32.54
## 234 !NA DL019 40.41 83.02 12.68 49.69 36.48 15.48 0.77 3.38 23.59
## 235 !NA DL019 35.08 78.99 9.70 43.71 30.92 13.76 0.78 2.82 23.46
## 236 !NA DL001 35.93 88.98 6.20 33.47 22.86 17.83 0.77 11.80 21.12
## 237 !NA DL001 68.20 122.67 7.38 62.18 39.08 28.39 0.98 16.03 38.18
## 238 !NA DL019 70.11 151.54 12.17 64.29 43.96 17.11 1.11 2.26 9.80
## 239 !NA DL001 734.56 830.10 20.72 85.46 62.22 72.95 0.58 20.49 1.19
## 240 !NA DL019 440.92 467.61 19.69 61.27 48.63 57.64 2.19 2.10 6.20
## 241 !NA DL001 332.56 461.98 1.91 43.24 24.52 40.56 0.62 20.31 24.77
## 242 !NA DL019 195.63 345.15 20.51 49.44 42.92 34.54 1.69 2.57 9.07
## 243 !NA DL019 110.42 250.46 44.49 81.54 79.64 20.11 1.11 5.05 5.36
## 244 !NA DL001 91.46 174.06 4.36 47.23 28.68 11.13 0.29 27.51 41.28
## 245 !NA DL001 24.76 45.07 2.19 24.24 14.68 15.94 0.28 10.05 40.49
## 246 !NA DL019 28.50 50.68 6.26 47.56 30.39 17.71 0.86 2.60 6.18
## 247 !NA DL019 44.48 74.74 19.24 49.16 41.78 18.21 1.15 3.52 6.61
## 248 !NA DL001 40.33 76.52 9.96 39.90 29.35 18.19 0.38 8.01 26.16
## 249 !NA DL001 86.82 110.94 2.34 36.47 21.31 23.87 0.97 13.54 38.81
## 250 !NA DL019 91.49 123.75 5.39 49.23 30.57 30.46 1.26 3.10 9.15
## 251 !NA DL019 68.75 107.60 18.33 47.52 40.21 25.29 1.15 3.95 6.80
## 252 !NA DL001 59.38 96.82 15.28 38.18 32.78 22.10 0.93 11.91 21.85
## 253 !NA DL001 85.81 128.54 8.44 37.92 27.06 22.70 0.89 13.23 22.09
## 254 !NA DL019 90.01 134.80 15.73 40.80 34.51 25.81 1.19 4.24 9.01
## 255 !NA DL001 95.12 121.26 3.85 29.23 18.68 24.64 1.15 9.58 14.12
## 256 !NA DL019 117.11 151.92 30.51 42.87 47.84 33.02 1.61 3.96 3.89
## 257 !NA DL001 88.85 108.21 3.85 27.46 17.74 21.40 0.96 7.13 11.60
## 258 !NA DL019 86.00 123.29 4.48 36.07 22.76 26.83 1.07 3.60 7.08
## 259 !NA DL001 77.09 104.93 3.00 19.28 12.71 23.75 0.83 8.26 17.46
## 260 !NA DL001 113.53 161.86 4.36 29.42 19.20 27.56 1.02 11.43 20.82
## 261 !NA DL019 119.33 180.12 10.17 47.02 33.31 25.70 1.67 4.11 8.86
## 262 !NA DL001 117.48 170.85 5.80 28.96 20.13 20.36 1.05 9.41 4.36
## 263 !NA DL019 135.35 205.21 18.61 51.61 42.63 33.82 1.38 4.82 3.78
## 264 !NA DL001 99.90 137.06 7.12 22.38 17.70 20.11 1.34 8.25 4.99
## 265 !NA DL019 85.73 132.42 17.57 40.04 35.61 30.63 1.40 4.17 4.89
## 266 !NA DL001 104.63 153.11 7.44 32.91 23.58 19.53 1.35 8.82 4.89
## 267 !NA DL019 116.68 188.74 27.20 47.55 47.47 29.63 1.40 4.42 5.75
## Benzene Toluene Xylene AQI AQI_Bucket StationId_NA Date_NA PM2.5_NA
## 1 1.74 11.65 1.60 80 Satisfactory !NA !NA !NA
## 2 2.10 8.95 1.23 55 Satisfactory !NA !NA !NA
## 3 10.66 11.07 4.41 59 Satisfactory !NA !NA !NA
## 4 8.03 15.18 4.50 92 Satisfactory !NA !NA !NA
## 5 3.03 10.98 1.84 152 Moderate !NA !NA !NA
## 6 3.08 16.27 2.48 104 Moderate !NA !NA !NA
## 7 1.98 13.30 1.66 109 Moderate !NA !NA !NA
## 8 2.71 11.25 2.98 91 Satisfactory !NA !NA !NA
## 9 3.29 15.64 2.96 91 Satisfactory !NA !NA !NA
## 10 1.75 9.87 1.48 86 Satisfactory !NA !NA !NA
## 11 2.71 17.79 3.68 88 Satisfactory !NA !NA !NA
## 12 2.50 14.86 3.02 96 Satisfactory !NA !NA !NA
## 13 3.31 14.58 3.06 105 Moderate !NA !NA !NA
## 14 4.00 9.55 1.60 97 Satisfactory !NA !NA !NA
## 15 6.14 15.44 3.38 88 Satisfactory !NA !NA !NA
## 16 5.32 13.99 3.91 100 Satisfactory !NA !NA !NA
## 17 4.57 8.92 2.84 75 Satisfactory !NA !NA !NA
## 18 2.51 10.43 2.28 76 Satisfactory !NA !NA !NA
## 19 2.32 14.93 2.02 89 Satisfactory !NA !NA !NA
## 20 1.48 12.24 2.59 93 Satisfactory !NA !NA !NA
## 21 1.67 15.14 2.51 63 Satisfactory !NA !NA !NA
## 22 1.50 11.78 2.13 73 Satisfactory !NA !NA !NA
## 23 1.02 8.94 1.85 100 Satisfactory !NA !NA !NA
## 24 0.76 8.28 1.21 123 Moderate !NA !NA !NA
## 25 1.21 10.71 2.43 133 Moderate !NA !NA !NA
## 26 1.61 10.94 1.84 164 Moderate !NA !NA !NA
## 27 1.30 11.69 2.03 124 Moderate !NA !NA !NA
## 28 1.65 13.64 2.80 110 Moderate !NA !NA !NA
## 29 1.69 13.13 2.45 97 Satisfactory !NA !NA !NA
## 30 2.74 13.28 3.92 343 Very Poor !NA !NA !NA
## 31 1.75 10.45 1.26 221 Poor !NA !NA !NA
## 32 4.75 22.63 8.32 357 Very Poor !NA !NA !NA
## 33 5.64 28.30 0.51 372 Very Poor !NA !NA !NA
## 34 4.20 33.48 0.59 280 Poor !NA !NA !NA
## 35 4.91 22.90 6.81 244 Poor !NA !NA !NA
## 36 2.61 29.13 0.12 319 Very Poor !NA !NA !NA
## 37 4.06 14.67 3.07 308 Very Poor !NA !NA !NA
## 38 4.15 24.65 6.71 402 Severe !NA !NA !NA
## 39 8.40 63.70 3.90 403 Severe !NA !NA !NA
## 40 2.63 17.28 3.28 242 Poor !NA !NA !NA
## 41 4.86 40.12 2.04 249 Poor !NA !NA !NA
## 42 1.10 18.50 0.00 199 Moderate !NA !NA !NA
## 43 2.51 14.36 2.14 221 Poor !NA !NA !NA
## 44 1.85 8.56 0.27 352 Very Poor !NA !NA !NA
## 45 2.72 14.48 2.19 351 Very Poor !NA !NA !NA
## 46 2.71 9.38 1.37 389 Very Poor !NA !NA !NA
## 47 4.36 18.49 4.56 361 Very Poor !NA !NA !NA
## 48 0.44 1.27 0.02 171 Moderate !NA !NA !NA
## 49 1.76 7.62 0.55 163 Moderate !NA !NA !NA
## 50 1.82 20.76 1.69 131 Moderate !NA !NA !NA
## 51 5.92 28.73 2.15 201 Poor !NA !NA !NA
## 52 1.53 6.43 0.47 292 Poor !NA !NA !NA
## 53 4.18 22.69 2.07 329 Very Poor !NA !NA !NA
## 54 0.58 7.17 0.09 148 Moderate !NA !NA !NA
## 55 1.64 11.15 0.35 158 Moderate !NA !NA !NA
## 56 1.76 8.57 0.11 200 Moderate !NA !NA !NA
## 57 0.58 1.65 0.00 190 Moderate !NA !NA !NA
## 58 0.94 9.92 0.03 339 Very Poor !NA !NA !NA
## 59 3.03 10.49 0.58 321 Very Poor !NA !NA !NA
## 60 1.13 7.45 0.01 346 Very Poor !NA !NA !NA
## 61 3.58 13.83 0.69 344 Very Poor !NA !NA !NA
## 62 3.70 20.42 4.85 380 Very Poor !NA !NA !NA
## 63 3.72 15.44 3.03 366 Very Poor !NA !NA !NA
## 64 0.74 3.54 0.14 267 Poor !NA !NA !NA
## 65 0.86 2.85 0.15 249 Poor !NA !NA !NA
## 66 2.51 12.12 0.64 122 Moderate !NA !NA !NA
## 67 1.07 18.50 0.88 112 Moderate !NA !NA !NA
## 68 0.49 21.40 0.09 133 Moderate !NA !NA !NA
## 69 2.45 8.49 0.54 141 Moderate !NA !NA !NA
## 70 2.72 10.82 1.51 338 Very Poor !NA !NA !NA
## 71 1.80 10.75 1.74 366 Very Poor !NA !NA !NA
## 72 0.84 4.52 0.02 297 Poor !NA !NA !NA
## 73 2.74 9.93 0.98 310 Very Poor !NA !NA !NA
## 74 1.97 7.96 0.37 319 Very Poor !NA !NA !NA
## 75 0.50 8.24 0.00 291 Poor !NA !NA !NA
## 76 0.71 3.79 0.09 245 Poor !NA !NA !NA
## 77 2.91 13.07 1.39 215 Poor !NA !NA !NA
## 78 3.69 19.43 2.45 322 Very Poor !NA !NA !NA
## 79 0.40 1.12 0.00 281 Poor !NA !NA !NA
## 80 0.31 1.51 0.00 119 Moderate !NA !NA !NA
## 81 1.75 10.35 1.09 147 Moderate !NA !NA !NA
## 82 1.73 7.28 1.01 161 Moderate !NA !NA !NA
## 83 0.94 5.34 1.44 146 Moderate !NA !NA !NA
## 84 0.21 0.72 0.00 119 Moderate !NA !NA !NA
## 85 1.08 5.51 0.21 127 Moderate !NA !NA !NA
## 86 0.17 1.01 0.00 77 Satisfactory !NA !NA !NA
## 87 1.30 3.30 0.00 185 Moderate !NA !NA !NA
## 88 1.06 6.11 0.21 250 Poor !NA !NA !NA
## 89 0.68 2.66 0.20 212 Poor !NA !NA !NA
## 90 1.32 3.52 0.09 149 Moderate !NA !NA !NA
## 91 0.58 7.68 0.05 146 Moderate !NA !NA !NA
## 92 2.57 10.15 0.90 212 Poor !NA !NA !NA
## 93 0.51 27.04 0.13 135 Moderate !NA !NA !NA
## 94 1.78 4.76 0.13 170 Moderate !NA !NA !NA
## 95 0.68 3.98 0.04 153 Moderate !NA !NA !NA
## 96 1.77 8.04 0.16 156 Moderate !NA !NA !NA
## 97 2.41 13.08 0.40 181 Moderate !NA !NA !NA
## 98 1.09 5.54 0.00 146 Moderate !NA !NA !NA
## 99 0.54 9.44 0.00 102 Moderate !NA !NA !NA
## 100 1.71 8.43 0.13 135 Moderate !NA !NA !NA
## 101 1.55 5.94 0.14 179 Moderate !NA !NA !NA
## 102 1.12 4.88 0.09 155 Moderate !NA !NA !NA
## 103 3.87 18.15 1.36 224 Poor !NA !NA !NA
## 104 5.04 17.64 1.95 291 Poor !NA !NA !NA
## 105 1.91 11.61 0.68 194 Moderate !NA !NA !NA
## 106 1.43 6.93 0.30 158 Moderate !NA !NA !NA
## 107 1.13 9.78 0.25 146 Moderate !NA !NA !NA
## 108 1.38 6.55 0.61 178 Moderate !NA !NA !NA
## 109 0.30 1.84 0.01 104 Moderate !NA !NA !NA
## 110 1.46 5.31 0.11 91 Satisfactory !NA !NA !NA
## 111 2.07 17.25 0.70 127 Moderate !NA !NA !NA
## 112 1.56 3.40 0.05 252 Poor !NA !NA !NA
## 113 3.24 14.85 0.97 203 Poor !NA !NA !NA
## 114 1.57 5.89 0.09 278 Poor !NA !NA !NA
## 115 1.20 5.09 0.16 194 Moderate !NA !NA !NA
## 116 5.45 18.65 1.65 174 Moderate !NA !NA !NA
## 117 3.56 15.63 1.30 172 Moderate !NA !NA !NA
## 118 2.67 13.55 1.74 243 Poor !NA !NA !NA
## 119 1.28 4.89 0.06 183 Moderate !NA !NA !NA
## 120 1.14 4.97 0.18 159 Moderate !NA !NA !NA
## 121 3.31 12.55 3.59 139 Moderate !NA !NA !NA
## 122 2.30 5.44 0.25 117 Moderate !NA !NA !NA
## 123 0.09 0.45 0.00 221 Poor !NA !NA !NA
## 124 0.28 1.13 0.02 134 Moderate !NA !NA !NA
## 125 0.75 5.62 0.27 115 Moderate !NA !NA !NA
## 126 1.16 7.35 0.53 159 Moderate !NA !NA !NA
## 127 2.88 7.55 3.09 153 Moderate !NA !NA !NA
## 128 1.25 4.33 0.31 113 Moderate !NA !NA !NA
## 129 1.86 19.13 2.46 76 Satisfactory !NA !NA !NA
## 130 1.61 9.64 0.57 123 Moderate !NA !NA !NA
## 131 1.60 15.18 1.18 79 Satisfactory !NA !NA !NA
## 132 0.52 2.24 0.00 108 Moderate !NA !NA !NA
## 133 1.23 14.38 0.50 76 Satisfactory !NA !NA !NA
## 134 0.75 3.90 0.04 100 Satisfactory !NA !NA !NA
## 135 1.21 16.22 0.61 112 Moderate !NA !NA !NA
## 136 0.76 3.66 0.11 130 Moderate !NA !NA !NA
## 137 0.25 27.44 0.43 68 Satisfactory !NA !NA !NA
## 138 1.03 6.48 0.10 103 Moderate !NA !NA !NA
## 139 0.58 22.91 1.12 77 Satisfactory !NA !NA !NA
## 140 0.57 5.23 0.12 88 Satisfactory !NA !NA !NA
## 141 1.09 28.35 1.73 154 Moderate !NA !NA !NA
## 142 1.46 8.45 0.88 108 Moderate !NA !NA !NA
## 143 0.29 17.95 1.00 114 Moderate !NA !NA !NA
## 144 1.55 7.70 0.60 132 Moderate !NA !NA !NA
## 145 0.23 16.80 0.74 128 Moderate !NA !NA !NA
## 146 0.88 7.71 0.24 123 Moderate !NA !NA !NA
## 147 1.75 6.83 0.87 106 Moderate !NA !NA !NA
## 148 0.19 11.27 0.45 101 Moderate !NA !NA !NA
## 149 1.87 13.82 1.00 179 Moderate !NA !NA !NA
## 150 0.03 7.95 0.25 128 Moderate !NA !NA !NA
## 151 0.40 24.55 0.49 122 Moderate !NA !NA !NA
## 152 1.28 7.79 0.99 194 Moderate !NA !NA !NA
## 153 0.11 11.26 0.57 81 Satisfactory !NA !NA !NA
## 154 0.35 2.89 0.15 98 Satisfactory !NA !NA !NA
## 155 0.67 15.19 1.02 63 Satisfactory !NA !NA !NA
## 156 0.90 5.47 0.28 92 Satisfactory !NA !NA !NA
## 157 0.79 4.00 0.41 78 Satisfactory !NA !NA !NA
## 158 0.55 10.18 0.60 59 Satisfactory !NA !NA !NA
## 159 0.43 7.97 0.51 40 Good !NA !NA !NA
## 160 0.13 1.31 0.01 81 Satisfactory !NA !NA !NA
## 161 0.19 1.17 0.00 72 Satisfactory !NA !NA !NA
## 162 0.52 7.64 0.30 42 Good !NA !NA !NA
## 163 0.51 6.68 0.48 47 Good !NA !NA !NA
## 164 0.67 2.92 0.11 79 Satisfactory !NA !NA !NA
## 165 0.25 1.65 0.01 88 Satisfactory !NA !NA !NA
## 166 0.48 6.22 0.27 45 Good !NA !NA !NA
## 167 0.17 18.14 0.19 93 Satisfactory !NA !NA !NA
## 168 1.74 6.99 2.46 95 Satisfactory !NA !NA !NA
## 169 0.08 8.63 0.89 49 Good !NA !NA !NA
## 170 1.51 11.10 0.91 84 Satisfactory !NA !NA !NA
## 171 0.81 7.63 0.24 88 Satisfactory !NA !NA !NA
## 172 0.19 7.54 0.44 57 Satisfactory !NA !NA !NA
## 173 0.03 8.23 0.45 105 Moderate !NA !NA !NA
## 174 0.30 0.97 0.02 76 Satisfactory !NA !NA !NA
## 175 0.05 14.82 1.01 109 Moderate !NA !NA !NA
## 176 0.23 1.76 0.00 76 Satisfactory !NA !NA !NA
## 177 0.17 1.04 0.00 75 Satisfactory !NA !NA !NA
## 178 0.02 15.16 0.98 101 Moderate !NA !NA !NA
## 179 0.76 3.64 0.34 75 Satisfactory !NA !NA !NA
## 180 0.06 13.62 1.39 92 Satisfactory !NA !NA !NA
## 181 1.14 7.20 0.67 84 Satisfactory !NA !NA !NA
## 182 0.19 19.90 1.27 93 Satisfactory !NA !NA !NA
## 183 0.83 8.12 0.48 98 Satisfactory !NA !NA !NA
## 184 0.44 16.33 1.01 51 Satisfactory !NA !NA !NA
## 185 0.11 5.80 0.10 42 Good !NA !NA !NA
## 186 0.24 1.57 0.06 72 Satisfactory !NA !NA !NA
## 187 0.10 1.03 0.00 59 Satisfactory !NA !NA !NA
## 188 0.21 8.30 0.54 43 Good !NA !NA !NA
## 189 0.37 3.29 0.12 51 Satisfactory !NA !NA !NA
## 190 0.25 15.44 0.61 52 Satisfactory !NA !NA !NA
## 191 1.60 10.31 1.19 92 Satisfactory !NA !NA !NA
## 192 0.73 16.48 1.80 64 Satisfactory !NA !NA !NA
## 193 1.65 8.65 0.67 102 Moderate !NA !NA !NA
## 194 2.31 11.46 3.52 61 Satisfactory !NA !NA !NA
## 195 1.07 5.21 0.23 111 Moderate !NA !NA !NA
## 196 6.91 11.97 9.88 87 Satisfactory !NA !NA !NA
## 197 1.31 8.24 0.75 118 Moderate !NA !NA !NA
## 198 1.41 13.92 2.54 117 Moderate !NA !NA !NA
## 199 0.83 7.35 1.15 92 Satisfactory !NA !NA !NA
## 200 0.97 7.48 0.43 102 Moderate !NA !NA !NA
## 201 0.84 7.97 0.35 74 Satisfactory !NA !NA !NA
## 202 0.38 5.38 0.69 51 Satisfactory !NA !NA !NA
## 203 0.51 5.42 0.60 52 Satisfactory !NA !NA !NA
## 204 1.14 7.84 0.59 87 Satisfactory !NA !NA !NA
## 205 0.45 3.29 0.05 86 Satisfactory !NA !NA !NA
## 206 0.20 9.26 0.62 46 Good !NA !NA !NA
## 207 1.14 5.79 0.41 101 Moderate !NA !NA !NA
## 208 0.16 19.63 2.29 80 Satisfactory !NA !NA !NA
## 209 1.44 7.47 0.48 124 Moderate !NA !NA !NA
## 210 0.55 18.39 1.69 106 Moderate !NA !NA !NA
## 211 1.12 18.21 1.68 87 Satisfactory !NA !NA !NA
## 212 0.44 2.70 0.08 111 Moderate !NA !NA !NA
## 213 0.24 1.86 0.05 88 Satisfactory !NA !NA !NA
## 214 0.59 15.64 0.71 56 Satisfactory !NA !NA !NA
## 215 0.43 3.65 0.14 84 Satisfactory !NA !NA !NA
## 216 1.57 19.25 1.10 91 Satisfactory !NA !NA !NA
## 217 1.21 25.63 4.40 101 Moderate !NA !NA !NA
## 218 0.65 3.40 0.29 91 Satisfactory !NA !NA !NA
## 219 0.58 4.69 0.24 79 Satisfactory !NA !NA !NA
## 220 2.40 22.63 4.60 80 Satisfactory !NA !NA !NA
## 221 1.49 14.11 2.01 62 Satisfactory !NA !NA !NA
## 222 0.85 4.85 0.29 86 Satisfactory !NA !NA !NA
## 223 2.39 30.38 5.22 63 Satisfactory !NA !NA !NA
## 224 0.82 5.98 0.52 104 Moderate !NA !NA !NA
## 225 0.38 2.94 0.19 41 Good !NA !NA !NA
## 226 0.26 2.43 0.04 60 Satisfactory !NA !NA !NA
## 227 0.65 6.37 0.76 76 Satisfactory !NA !NA !NA
## 228 0.74 3.99 0.27 60 Satisfactory !NA !NA !NA
## 229 0.89 9.49 1.54 81 Satisfactory !NA !NA !NA
## 230 0.54 3.09 0.11 86 Satisfactory !NA !NA !NA
## 231 0.56 6.29 1.50 74 Satisfactory !NA !NA !NA
## 232 0.58 5.08 0.63 74 Satisfactory !NA !NA !NA
## 233 0.42 6.34 0.57 89 Satisfactory !NA !NA !NA
## 234 1.46 7.39 0.24 90 Satisfactory !NA !NA !NA
## 235 1.61 8.07 0.58 89 Satisfactory !NA !NA !NA
## 236 0.43 4.77 0.72 76 Satisfactory !NA !NA !NA
## 237 0.75 11.90 0.81 170 Moderate !NA !NA !NA
## 238 2.43 13.59 0.91 143 Moderate !NA !NA !NA
## 239 2.90 18.12 3.82 668 Severe !NA !NA !NA
## 240 5.22 16.70 1.62 439 Severe !NA !NA !NA
## 241 1.67 28.89 1.98 692 Severe !NA !NA !NA
## 242 4.58 16.41 2.30 404 Severe !NA !NA !NA
## 243 5.64 20.27 2.79 262 Poor !NA !NA !NA
## 244 1.43 3.22 0.57 224 Poor !NA !NA !NA
## 245 0.23 1.23 0.09 80 Satisfactory !NA !NA !NA
## 246 1.33 4.66 0.10 93 Satisfactory !NA !NA !NA
## 247 2.66 13.04 1.44 66 Satisfactory !NA !NA !NA
## 248 0.66 11.26 0.63 62 Satisfactory !NA !NA !NA
## 249 0.80 3.51 0.33 328 Very Poor !NA !NA !NA
## 250 1.74 5.76 0.13 318 Very Poor !NA !NA !NA
## 251 3.14 15.14 0.87 156 Moderate !NA !NA !NA
## 252 0.64 3.74 0.35 152 Moderate !NA !NA !NA
## 253 0.83 10.23 0.33 141 Moderate !NA !NA !NA
## 254 2.54 12.27 0.51 174 Moderate !NA !NA !NA
## 255 0.88 2.85 0.18 270 Poor !NA !NA !NA
## 256 2.75 11.82 0.61 302 Very Poor !NA !NA !NA
## 257 0.80 3.72 0.28 212 Poor !NA !NA !NA
## 258 1.41 5.17 0.09 237 Poor !NA !NA !NA
## 259 0.58 3.87 0.08 159 Moderate !NA !NA !NA
## 260 0.94 23.62 0.66 237 Poor !NA !NA !NA
## 261 3.62 13.90 0.83 248 Poor !NA !NA !NA
## 262 1.14 5.77 0.61 282 Poor !NA !NA !NA
## 263 2.60 19.11 0.22 280 Poor !NA !NA !NA
## 264 1.00 8.24 0.63 284 Poor !NA !NA !NA
## 265 3.21 13.55 0.37 269 Poor !NA !NA !NA
## 266 1.27 11.09 0.58 204 Poor !NA !NA !NA
## 267 3.86 17.94 0.53 238 Poor !NA !NA !NA
## PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA O3_NA Benzene_NA Toluene_NA
## 1 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 7 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 8 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 9 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 10 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 11 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 12 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 13 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 14 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 15 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 16 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 17 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 18 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 19 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 20 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 21 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 22 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 23 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 24 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 25 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 26 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 27 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 28 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 29 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 30 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 31 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 32 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 33 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 34 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 35 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 36 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 37 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 38 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 39 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 40 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 41 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 42 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 43 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 44 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 45 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 46 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 47 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 48 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 49 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 50 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 51 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 52 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 53 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 54 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 55 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 56 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 57 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 58 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 59 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 60 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 61 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 62 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 63 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 64 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 65 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 66 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 67 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 68 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 69 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 70 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 71 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 72 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 73 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 74 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 75 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 76 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 77 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 78 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 79 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 80 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 81 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 82 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 83 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 84 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 85 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 86 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 87 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 88 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 89 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 90 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 91 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 92 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 93 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 94 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 95 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 96 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 97 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 98 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 99 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 100 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 101 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 102 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 103 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 104 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 105 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 106 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 107 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 108 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 109 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 110 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 111 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 112 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 113 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 114 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 115 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 116 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 117 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 118 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 119 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 120 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 121 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 122 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 123 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 124 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 125 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 126 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 127 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 128 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 129 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 130 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 131 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 132 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 133 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 134 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 135 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 136 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 137 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 138 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 139 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 140 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 141 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 142 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 143 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 144 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 145 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 146 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 147 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 148 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 149 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 150 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 151 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 152 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 153 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 154 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 155 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 156 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 157 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 158 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 159 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 160 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 161 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 162 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 163 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 164 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 165 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 166 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 167 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 168 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 169 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 170 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 171 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 172 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 173 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 174 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 175 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 176 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 177 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 178 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 179 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 180 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 181 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 182 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 183 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 184 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 185 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 186 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 187 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 188 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 189 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 190 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 191 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 192 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 193 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 194 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 195 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 196 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 197 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 198 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 199 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 200 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 201 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 202 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 203 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 204 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 205 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 206 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 207 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 208 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 209 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 210 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 211 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 212 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 213 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 214 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 215 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 216 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 217 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 218 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 219 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 220 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 221 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 222 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 223 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 224 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 225 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 226 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 227 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 228 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 229 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 230 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 231 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 232 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 233 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 234 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 235 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 236 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 237 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 238 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 239 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 240 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 241 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 242 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 243 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 244 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 245 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 246 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 247 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 248 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 249 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 250 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 251 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 252 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 253 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 254 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 255 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 256 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 257 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 258 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 259 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 260 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 261 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 262 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 263 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 264 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 265 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 266 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 267 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## Xylene_NA AQI_NA AQI_Bucket_NA Daily_Delay Month
## 1 !NA !NA !NA 42 6
## 2 !NA !NA !NA 190 6
## 3 !NA !NA !NA 66 6
## 4 !NA !NA !NA 30 6
## 5 !NA !NA !NA 120 7
## 6 !NA !NA !NA 31 7
## 7 !NA !NA !NA 8 7
## 8 !NA !NA !NA 40 7
## 9 !NA !NA !NA 58 7
## 10 !NA !NA !NA 32 7
## 11 !NA !NA !NA 43 7
## 12 !NA !NA !NA 48 7
## 13 !NA !NA !NA 31 7
## 14 !NA !NA !NA 40 7
## 15 !NA !NA !NA 59 7
## 16 !NA !NA !NA 140 7
## 17 !NA !NA !NA 45 7
## 18 !NA !NA !NA 62 7
## 19 !NA !NA !NA 50 7
## 20 !NA !NA !NA 100 7
## 21 !NA !NA !NA 61 7
## 22 !NA !NA !NA 50 7
## 23 !NA !NA !NA 109 7
## 24 !NA !NA !NA 42 7
## 25 !NA !NA !NA 33 7
## 26 !NA !NA !NA 40 8
## 27 !NA !NA !NA 37 8
## 28 !NA !NA !NA 62 8
## 29 !NA !NA !NA 105 8
## 30 !NA !NA !NA 35 11
## 31 !NA !NA !NA 23 11
## 32 !NA !NA !NA 58 11
## 33 !NA !NA !NA 58 11
## 34 !NA !NA !NA 60 11
## 35 !NA !NA !NA 60 11
## 36 !NA !NA !NA 46 11
## 37 !NA !NA !NA 46 11
## 38 !NA !NA !NA 63 12
## 39 !NA !NA !NA 63 12
## 40 !NA !NA !NA 106 12
## 41 !NA !NA !NA 106 12
## 42 !NA !NA !NA 145 12
## 43 !NA !NA !NA 145 12
## 44 !NA !NA !NA 34 1
## 45 !NA !NA !NA 34 1
## 46 !NA !NA !NA 31 1
## 47 !NA !NA !NA 31 1
## 48 !NA !NA !NA 36 1
## 49 !NA !NA !NA 36 1
## 50 !NA !NA !NA 66 1
## 51 !NA !NA !NA 66 1
## 52 !NA !NA !NA 45 1
## 53 !NA !NA !NA 45 1
## 54 !NA !NA !NA 45 1
## 55 !NA !NA !NA 45 1
## 56 !NA !NA !NA 56 1
## 57 !NA !NA !NA 56 1
## 58 !NA !NA !NA 52 2
## 59 !NA !NA !NA 52 2
## 60 !NA !NA !NA 189 2
## 61 !NA !NA !NA 189 2
## 62 !NA !NA !NA 43 2
## 63 !NA !NA !NA 43 2
## 64 !NA !NA !NA 63 2
## 65 !NA !NA !NA 63 2
## 66 !NA !NA !NA 98 2
## 67 !NA !NA !NA 98 2
## 68 !NA !NA !NA 207 2
## 69 !NA !NA !NA 207 2
## 70 !NA !NA !NA 67 2
## 71 !NA !NA !NA 67 2
## 72 !NA !NA !NA 43 2
## 73 !NA !NA !NA 43 2
## 74 !NA !NA !NA 34 2
## 75 !NA !NA !NA 34 2
## 76 !NA !NA !NA 70 2
## 77 !NA !NA !NA 70 2
## 78 !NA !NA !NA 29 2
## 79 !NA !NA !NA 29 2
## 80 !NA !NA !NA 54 2
## 81 !NA !NA !NA 54 2
## 82 !NA !NA !NA 65 2
## 83 !NA !NA !NA 65 2
## 84 !NA !NA !NA 43 2
## 85 !NA !NA !NA 43 2
## 86 !NA !NA !NA 34 2
## 87 !NA !NA !NA 57 3
## 88 !NA !NA !NA 57 3
## 89 !NA !NA !NA 57 3
## 90 !NA !NA !NA 57 3
## 91 !NA !NA !NA 33 3
## 92 !NA !NA !NA 33 3
## 93 !NA !NA !NA 184 3
## 94 !NA !NA !NA 184 3
## 95 !NA !NA !NA 27 3
## 96 !NA !NA !NA 27 3
## 97 !NA !NA !NA 104 3
## 98 !NA !NA !NA 104 3
## 99 !NA !NA !NA 33 3
## 100 !NA !NA !NA 33 3
## 101 !NA !NA !NA 56 3
## 102 !NA !NA !NA 41 4
## 103 !NA !NA !NA 62 4
## 104 !NA !NA !NA 38 4
## 105 !NA !NA !NA 27 4
## 106 !NA !NA !NA 40 4
## 107 !NA !NA !NA 40 4
## 108 !NA !NA !NA 93 4
## 109 !NA !NA !NA 68 4
## 110 !NA !NA !NA 52 4
## 111 !NA !NA !NA 64 4
## 112 !NA !NA !NA 51 5
## 113 !NA !NA !NA 72 5
## 114 !NA !NA !NA 29 5
## 115 !NA !NA !NA 24 5
## 116 !NA !NA !NA 41 5
## 117 !NA !NA !NA 32 5
## 118 !NA !NA !NA 34 5
## 119 !NA !NA !NA 90 5
## 120 !NA !NA !NA 45 5
## 121 !NA !NA !NA 22 5
## 122 !NA !NA !NA 41 5
## 123 !NA !NA !NA 87 6
## 124 !NA !NA !NA 63 6
## 125 !NA !NA !NA 76 6
## 126 !NA !NA !NA 80 6
## 127 !NA !NA !NA 82 6
## 128 !NA !NA !NA 56 7
## 129 !NA !NA !NA 50 7
## 130 !NA !NA !NA 50 7
## 131 !NA !NA !NA 47 7
## 132 !NA !NA !NA 47 7
## 133 !NA !NA !NA 55 7
## 134 !NA !NA !NA 55 7
## 135 !NA !NA !NA 170 7
## 136 !NA !NA !NA 170 7
## 137 !NA !NA !NA 46 7
## 138 !NA !NA !NA 46 7
## 139 !NA !NA !NA 159 7
## 140 !NA !NA !NA 159 7
## 141 !NA !NA !NA 97 7
## 142 !NA !NA !NA 97 7
## 143 !NA !NA !NA 56 7
## 144 !NA !NA !NA 56 7
## 145 !NA !NA !NA 313 7
## 146 !NA !NA !NA 313 7
## 147 !NA !NA !NA 32 7
## 148 !NA !NA !NA 32 7
## 149 !NA !NA !NA 65 7
## 150 !NA !NA !NA 65 7
## 151 !NA !NA !NA 44 7
## 152 !NA !NA !NA 44 7
## 153 !NA !NA !NA 25 7
## 154 !NA !NA !NA 25 7
## 155 !NA !NA !NA 29 7
## 156 !NA !NA !NA 29 7
## 157 !NA !NA !NA 78 7
## 158 !NA !NA !NA 78 7
## 159 !NA !NA !NA 126 7
## 160 !NA !NA !NA 126 7
## 161 !NA !NA !NA 41 7
## 162 !NA !NA !NA 41 7
## 163 !NA !NA !NA 68 7
## 164 !NA !NA !NA 68 7
## 165 !NA !NA !NA 42 7
## 166 !NA !NA !NA 42 7
## 167 !NA !NA !NA 61 8
## 168 !NA !NA !NA 61 8
## 169 !NA !NA !NA 45 8
## 170 !NA !NA !NA 45 8
## 171 !NA !NA !NA 113 8
## 172 !NA !NA !NA 113 8
## 173 !NA !NA !NA 53 8
## 174 !NA !NA !NA 53 8
## 175 !NA !NA !NA 48 8
## 176 !NA !NA !NA 48 8
## 177 !NA !NA !NA 85 8
## 178 !NA !NA !NA 85 8
## 179 !NA !NA !NA 38 8
## 180 !NA !NA !NA 38 8
## 181 !NA !NA !NA 87 8
## 182 !NA !NA !NA 87 8
## 183 !NA !NA !NA 231 8
## 184 !NA !NA !NA 231 8
## 185 !NA !NA !NA 110 8
## 186 !NA !NA !NA 110 8
## 187 !NA !NA !NA 38 8
## 188 !NA !NA !NA 38 8
## 189 !NA !NA !NA 214 8
## 190 !NA !NA !NA 214 8
## 191 !NA !NA !NA 54 8
## 192 !NA !NA !NA 54 8
## 193 !NA !NA !NA 91 8
## 194 !NA !NA !NA 91 8
## 195 !NA !NA !NA 60 8
## 196 !NA !NA !NA 60 8
## 197 !NA !NA !NA 45 8
## 198 !NA !NA !NA 45 8
## 199 !NA !NA !NA 59 8
## 200 !NA !NA !NA 59 8
## 201 !NA !NA !NA 111 8
## 202 !NA !NA !NA 111 8
## 203 !NA !NA !NA 129 8
## 204 !NA !NA !NA 129 8
## 205 !NA !NA !NA 59 9
## 206 !NA !NA !NA 59 9
## 207 !NA !NA !NA 15 9
## 208 !NA !NA !NA 15 9
## 209 !NA !NA !NA 73 9
## 210 !NA !NA !NA 73 9
## 211 !NA !NA !NA 43 9
## 212 !NA !NA !NA 43 9
## 213 !NA !NA !NA 41 9
## 214 !NA !NA !NA 41 9
## 215 !NA !NA !NA 50 9
## 216 !NA !NA !NA 50 9
## 217 !NA !NA !NA 74 9
## 218 !NA !NA !NA 74 9
## 219 !NA !NA !NA 50 9
## 220 !NA !NA !NA 50 9
## 221 !NA !NA !NA 83 9
## 222 !NA !NA !NA 83 9
## 223 !NA !NA !NA 36 9
## 224 !NA !NA !NA 36 9
## 225 !NA !NA !NA 35 9
## 226 !NA !NA !NA 35 9
## 227 !NA !NA !NA 72 9
## 228 !NA !NA !NA 72 9
## 229 !NA !NA !NA 56 10
## 230 !NA !NA !NA 56 10
## 231 !NA !NA !NA 45 10
## 232 !NA !NA !NA 66 10
## 233 !NA !NA !NA 90 10
## 234 !NA !NA !NA 90 10
## 235 !NA !NA !NA 205 10
## 236 !NA !NA !NA 205 10
## 237 !NA !NA !NA 72 10
## 238 !NA !NA !NA 72 10
## 239 !NA !NA !NA 65 11
## 240 !NA !NA !NA 65 11
## 241 !NA !NA !NA 279 11
## 242 !NA !NA !NA 279 11
## 243 !NA !NA !NA 69 11
## 244 !NA !NA !NA 69 11
## 245 !NA !NA !NA 50 11
## 246 !NA !NA !NA 50 11
## 247 !NA !NA !NA 74 11
## 248 !NA !NA !NA 74 11
## 249 !NA !NA !NA 420 12
## 250 !NA !NA !NA 420 12
## 251 !NA !NA !NA 98 12
## 252 !NA !NA !NA 98 12
## 253 !NA !NA !NA 42 12
## 254 !NA !NA !NA 42 12
## 255 !NA !NA !NA 157 1
## 256 !NA !NA !NA 57 1
## 257 !NA !NA !NA 57 1
## 258 !NA !NA !NA 20 1
## 259 !NA !NA !NA 20 1
## 260 !NA !NA !NA 116 1
## 261 !NA !NA !NA 116 1
## 262 !NA !NA !NA 26 1
## 263 !NA !NA !NA 26 1
## 264 !NA !NA !NA 48 1
## 265 !NA !NA !NA 48 1
## 266 !NA !NA !NA 45 1
## 267 !NA !NA !NA 45 1
# Preview the first rows of the merged Delhi dataset to sanity-check the join
head(Delhi_cohesive_dataset)
## Date any_missing tavg tmin tmax prcp time_NA tavg_NA tmin_NA tmax_NA
## 1 2018-06-27 Not Missing 30.3 26.2 37.5 3.0 !NA !NA !NA !NA
## 2 2018-06-28 Not Missing 29.9 24.2 37.5 20.1 !NA !NA !NA !NA
## 3 2018-06-29 Not Missing 30.7 27.9 35.2 1.0 !NA !NA !NA !NA
## 4 2018-06-30 Not Missing 31.3 27.5 35.6 9.9 !NA !NA !NA !NA
## 5 2018-07-04 Not Missing 31.7 26.1 36.7 5.1 !NA !NA !NA !NA
## 6 2018-07-06 Not Missing 32.9 28.1 37.3 5.1 !NA !NA !NA !NA
## prcp_NA StationId PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3
## 1 !NA DL019 48.03 89.10 4.09 39.86 24.30 18.91 0.68 12.71 9.14
## 2 !NA DL019 23.98 38.46 3.64 34.88 21.51 26.11 0.52 11.41 6.42
## 3 !NA DL019 34.77 60.62 11.30 53.24 37.53 38.76 0.75 9.87 11.59
## 4 !NA DL019 42.65 113.91 5.90 50.46 31.40 21.05 0.91 12.43 10.90
## 5 !NA DL019 44.09 138.82 2.30 34.02 19.90 27.99 0.56 7.27 13.37
## 6 !NA DL019 48.80 110.93 8.11 29.32 22.19 35.32 0.53 10.14 15.38
## Benzene Toluene Xylene AQI AQI_Bucket StationId_NA Date_NA PM2.5_NA PM10_NA
## 1 1.74 11.65 1.60 80 Satisfactory !NA !NA !NA !NA
## 2 2.10 8.95 1.23 55 Satisfactory !NA !NA !NA !NA
## 3 10.66 11.07 4.41 59 Satisfactory !NA !NA !NA !NA
## 4 8.03 15.18 4.50 92 Satisfactory !NA !NA !NA !NA
## 5 3.03 10.98 1.84 152 Moderate !NA !NA !NA !NA
## 6 3.08 16.27 2.48 104 Moderate !NA !NA !NA !NA
## NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA O3_NA Benzene_NA Toluene_NA Xylene_NA
## 1 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA !NA !NA !NA !NA
## AQI_NA AQI_Bucket_NA Daily_Delay Month
## 1 !NA !NA 42 6
## 2 !NA !NA 190 6
## 3 !NA !NA 66 6
## 4 !NA !NA 30 6
## 5 !NA !NA 120 7
## 6 !NA !NA 31 7
#Looks like the merger is successful with no NA
#Lets summarize full dataset
# Column-by-column summary of the merged dataset; the *_NA indicator columns
# all show "NA : 0", confirming the merge left no missing values
summary(Delhi_cohesive_dataset)
## Date any_missing tavg tmin
## Min. :2018-06-27 Length:267 Min. :10.40 Min. : 5.30
## 1st Qu.:2019-02-08 Class :character 1st Qu.:18.30 1st Qu.:12.80
## Median :2019-07-07 Mode :character Median :28.20 Median :23.80
## Mean :2019-05-16 Mean :25.29 Mean :20.24
## 3rd Qu.:2019-08-25 3rd Qu.:30.90 3rd Qu.:26.20
## Max. :2020-01-18 Max. :35.60 Max. :28.80
## tmax prcp time_NA tavg_NA tmin_NA tmax_NA
## Min. :14.60 Min. : 0.000 !NA:267 !NA:267 !NA:267 !NA:267
## 1st Qu.:25.30 1st Qu.: 0.000 NA : 0 NA : 0 NA : 0 NA : 0
## Median :33.80 Median : 0.500
## Mean :30.94 Mean : 5.285
## 3rd Qu.:36.00 3rd Qu.: 5.100
## Max. :43.40 Max. :70.100
## prcp_NA StationId PM2.5 PM10 NO
## !NA:267 Length:267 Min. : 10.06 Min. : 10.75 Min. : 1.01
## NA : 0 Class :character 1st Qu.: 33.75 1st Qu.: 79.05 1st Qu.: 3.42
## Mode :character Median : 52.11 Median :113.91 Median : 7.12
## Mean : 69.21 Mean :128.93 Mean :11.75
## 3rd Qu.: 86.25 3rd Qu.:159.65 3rd Qu.:15.42
## Max. :734.56 Max. :830.10 Max. :67.89
## NO2 NOx NH3 CO
## Min. : 17.46 Min. :10.37 Min. : 6.79 Min. :0.2800
## 1st Qu.: 32.93 1st Qu.:22.20 1st Qu.:19.58 1st Qu.:0.6800
## Median : 41.93 Median :28.68 Median :24.64 Median :0.8100
## Mean : 44.17 Mean :32.92 Mean :26.77 Mean :0.8846
## 3rd Qu.: 51.61 3rd Qu.:39.95 3rd Qu.:30.82 3rd Qu.:0.9900
## Max. :112.54 Max. :94.31 Max. :72.95 Max. :2.8000
## SO2 O3 Benzene Toluene
## Min. : 2.100 Min. : 1.19 Min. : 0.020 Min. : 0.45
## 1st Qu.: 5.065 1st Qu.:11.60 1st Qu.: 0.580 1st Qu.: 5.40
## Median : 9.930 Median :18.62 Median : 1.210 Median : 8.95
## Mean :10.781 Mean :19.66 Mean : 1.667 Mean :10.77
## 3rd Qu.:12.450 3rd Qu.:24.61 3rd Qu.: 2.355 3rd Qu.:14.84
## Max. :32.840 Max. :74.89 Max. :10.660 Max. :63.70
## Xylene AQI AQI_Bucket StationId_NA Date_NA
## Min. :0.000 Min. : 40.0 Length:267 !NA:267 !NA:267
## 1st Qu.:0.180 1st Qu.: 86.5 Class :character NA : 0 NA : 0
## Median :0.600 Median :119.0 Mode :character
## Mean :1.113 Mean :156.3
## 3rd Qu.:1.600 3rd Qu.:202.0
## Max. :9.880 Max. :692.0
## PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA
## !NA:267 !NA:267 !NA:267 !NA:267 !NA:267 !NA:267 !NA:267
## NA : 0 NA : 0 NA : 0 NA : 0 NA : 0 NA : 0 NA : 0
##
##
##
##
## SO2_NA O3_NA Benzene_NA Toluene_NA Xylene_NA AQI_NA AQI_Bucket_NA
## !NA:267 !NA:267 !NA:267 !NA:267 !NA:267 !NA:267 !NA:267
## NA : 0 NA : 0 NA : 0 NA : 0 NA : 0 NA : 0 NA : 0
##
##
##
##
## Daily_Delay Month
## Min. : 8.00 Min. : 1.000
## 1st Qu.: 42.00 1st Qu.: 3.000
## Median : 56.00 Median : 7.000
## Mean : 73.72 Mean : 6.341
## 3rd Qu.: 81.00 3rd Qu.: 8.000
## Max. :420.00 Max. :12.000
# List the column names to pick candidate variables for the plots below
names(Delhi_cohesive_dataset)
## [1] "Date" "any_missing" "tavg" "tmin"
## [5] "tmax" "prcp" "time_NA" "tavg_NA"
## [9] "tmin_NA" "tmax_NA" "prcp_NA" "StationId"
## [13] "PM2.5" "PM10" "NO" "NO2"
## [17] "NOx" "NH3" "CO" "SO2"
## [21] "O3" "Benzene" "Toluene" "Xylene"
## [25] "AQI" "AQI_Bucket" "StationId_NA" "Date_NA"
## [29] "PM2.5_NA" "PM10_NA" "NO_NA" "NO2_NA"
## [33] "NOx_NA" "NH3_NA" "CO_NA" "SO2_NA"
## [37] "O3_NA" "Benzene_NA" "Toluene_NA" "Xylene_NA"
## [41] "AQI_NA" "AQI_Bucket_NA" "Daily_Delay" "Month"
# Scatter of daily delay vs AQI, coloured by AQI bucket and sized by rainfall
ggplot(Delhi_cohesive_dataset) +
  geom_point(aes(x = AQI, y = Daily_Delay, color = AQI_Bucket, size = prcp)) +
  labs(title = "Impact of AQI and prcp")
##As per the plot, Good AQI too gets observed for some delay cases but they are far and few... and does not seems to have caused high amount of delays
## There are a huge number of delays for satisfactory AQI cases, but most of the delays could be associated
## with pretty high precipitation
## There is a good amount of delays associated with moderate cases too, and they have caused significant delays when combined with high precipitation
## Delay instances reduces for Poor AQI cases but there is a slight increase in the values of delays
## For very poor cases, impact gets high when combined with precipitation
## Severe cases are high impact ones but looks like not affected with precipitation
## Now lets view this purely from the weather perspective
# NOTE(review): duplicate of the names() call at the start of this section —
# kept so the rendered output below still matches, but it could be removed
names(Delhi_cohesive_dataset)
## [1] "Date" "any_missing" "tavg" "tmin"
## [5] "tmax" "prcp" "time_NA" "tavg_NA"
## [9] "tmin_NA" "tmax_NA" "prcp_NA" "StationId"
## [13] "PM2.5" "PM10" "NO" "NO2"
## [17] "NOx" "NH3" "CO" "SO2"
## [21] "O3" "Benzene" "Toluene" "Xylene"
## [25] "AQI" "AQI_Bucket" "StationId_NA" "Date_NA"
## [29] "PM2.5_NA" "PM10_NA" "NO_NA" "NO2_NA"
## [33] "NOx_NA" "NH3_NA" "CO_NA" "SO2_NA"
## [37] "O3_NA" "Benzene_NA" "Toluene_NA" "Xylene_NA"
## [41] "AQI_NA" "AQI_Bucket_NA" "Daily_Delay" "Month"
# Daily delay vs average temperature, coloured by tmin and sized by rainfall
Delhi_cohesive_dataset %>%
  ggplot(aes(x = tavg, y = Daily_Delay, color = tmin, size = prcp)) +
  geom_point() +
  labs(title = "Impact of temp and prcp")
## Its clear that bigger precipitation brings in more instances of delays
## But its also interesting to find that higher tavg, higher precipitation and higher tmin bring
# in a lot of delays - though size of precipitation does not always result in costly delays
## Ok Lets also analyse if the components O3, PM2.5 and CO has impacts on delays
# Daily delay vs ozone, with point size also mapped to O3
ggplot(Delhi_cohesive_dataset) +
  geom_point(aes(x = O3, y = Daily_Delay, size = O3)) +
  labs(title = "Impact of O3")
## Looks like more O3 directly relates to higher delays
# Daily delay vs PM2.5, with point size also mapped to PM2.5
Delhi_cohesive_dataset %>%
  ggplot(aes(x = PM2.5, y = Daily_Delay, size = PM2.5)) +
  geom_point() +
  labs(title = "Impact of PM2.5")
## Looks like more PM2.5 might not have too much impact...
# Daily delay vs carbon monoxide, with point size also mapped to CO
ggplot(Delhi_cohesive_dataset) +
  geom_point(aes(x = CO, y = Daily_Delay, size = CO)) +
  labs(title = "Impact of CO")
## Looks like size of CO has some correlation but may not be linear...
# Daily delay vs PM10, with point size also mapped to PM10
Delhi_cohesive_dataset %>%
  ggplot(aes(x = PM10, y = Daily_Delay, size = PM10)) +
  geom_point() +
  labs(title = "Impact of PM10")
## Looks like more PM10 might not have too much impact...
# Daily delay vs precipitation, with point size also mapped to prcp
ggplot(Delhi_cohesive_dataset) +
  geom_point(aes(x = prcp, y = Daily_Delay, size = prcp)) +
  labs(title = "Impact of rain")
## Looks like amount of rain has direct impact on delays...
# Daily delay vs average temperature, with point size also mapped to tavg
Delhi_cohesive_dataset %>%
  ggplot(aes(x = tavg, y = Daily_Delay, size = tavg)) +
  geom_point() +
  labs(title = "Impact of Average Temp")
## Looks like a lot of low intensity delays at higher average temperature...
# Daily delay vs minimum temperature, with point size also mapped to tmin
ggplot(Delhi_cohesive_dataset) +
  geom_point(aes(x = tmin, y = Daily_Delay, size = tmin)) +
  labs(title = "Impact of Tmin")
## Looks like a lot of low intensity delays at higher tmin values...
# Daily delay vs AQI, sized by AQI and coloured by AQI bucket
Delhi_cohesive_dataset %>%
  ggplot(aes(x = AQI, y = Daily_Delay, size = AQI, color = AQI_Bucket)) +
  geom_point() +
  labs(title = "Impact of AQI")
## Looks like a lot of low intensity delays spread across the AQI range...
## Lets see if the months itself has any impact on the delay
# Daily delay by calendar month, sized by the delay itself; one tick per month
Delhi_cohesive_dataset %>%
  ggplot(aes(x = Month, y = Daily_Delay, size = Daily_Delay)) +
  geom_point() +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(title = "Impact of Month")
## Looks like there is high frequency of delays during monsoon and heavy delay during peak winter season
## Ok based on this, lets pick these elements to find the right model on impacts the delays of Delhi airtraffic:
## Precipitation, AQI, tmin, O3 and CO
## Lets see how the elements individually have linear regression relationship with the traffic delay
## Ok lets build the base model here
## Fit the base model (AQI only) and then one candidate second predictor at a
## time, so the incremental contribution of each variable can be compared.
## fmodel() plots the fitted effect of each model.
## (Fixed: use `<-` for assignment rather than `=`, per R style convention.)
Delhi_Traffic_Delay_Model_AQI <- lm(Daily_Delay ~ AQI, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_AQI)
Delhi_Traffic_Delay_Model_tavg <- lm(Daily_Delay ~ AQI+tavg, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_tavg)
Delhi_Traffic_Delay_Model_prcp <- lm(Daily_Delay ~ AQI+prcp, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_prcp)
Delhi_Traffic_Delay_Model_O3 <- lm(Daily_Delay ~ AQI+O3, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_O3)
Delhi_Traffic_Delay_Model_CO <- lm(Daily_Delay ~ AQI+CO, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_CO)
Delhi_Traffic_Delay_Model_Month <- lm(Daily_Delay ~ AQI+Month, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_Month)
# Scenario evaluation: predicted Daily_Delay at AQI = 0, 200 and 400 for each
# candidate model, holding the second predictor at a fixed representative value.
evaluate_model(Delhi_Traffic_Delay_Model_AQI)
## AQI model_output
## 1 0 65.89103
## 2 200 75.90179
## 3 400 85.91255
# Hot-day scenario: tavg held at 35
evaluate_model(Delhi_Traffic_Delay_Model_tavg, tavg = 35)
## AQI tavg model_output
## 1 0 35 65.28536
## 2 200 35 74.13083
## 3 400 35 82.97629
# Heavy-rain scenario: prcp held at 150 (note the large jump in predicted delay)
evaluate_model(Delhi_Traffic_Delay_Model_prcp, prcp = 150)
## AQI prcp model_output
## 1 0 150 207.8647
## 2 200 150 221.3815
## 3 400 150 234.8983
# High-ozone scenario: O3 held at 50
evaluate_model(Delhi_Traffic_Delay_Model_O3, O3 = 50)
## AQI O3 model_output
## 1 0 50 75.87323
## 2 200 50 86.52229
## 3 400 50 97.17135
# CO held at 1
evaluate_model(Delhi_Traffic_Delay_Model_CO, CO = 1)
## AQI CO model_output
## 1 0 1 66.60121
## 2 200 1 76.02051
## 3 400 1 85.43981
# Peak-winter scenario: Month held at 12 (December)
evaluate_model(Delhi_Traffic_Delay_Model_Month, Month = 12)
## AQI Month model_output
## 1 0 12 84.65495
## 2 200 12 100.87275
## 3 400 12 117.09055
# Spread in predicted delay between scenarios for each candidate model.
# NOTE(review): these figures do not match the evaluate_model() outputs printed
# just above — presumably transcribed from a different set of runs; verify.
# Wrapping each assignment in () auto-prints it, matching the original output.
(diff_1 <- 118.1039 - 104.8699)
## [1] 13.234
(diff_2 <- 114.0189 - 103.4728)
## [1] 10.5461
(diff_3 <- 338.1836 - 319.6457)
## [1] 18.5379
(diff_4 <- 148.3426 - 133.2912)
## [1] 15.0514
(diff_5 <- 116.9036 - 97.6898)
## [1] 19.2138
(diff_6 <- 131.5567 - 114.9787)
## [1] 16.578
# Comparing the model evaluations above, we can see that prcp, Month and CO have a good impact
# on the delay
## To evaluate the base model, split the data into test and train datasets
#make this split reproducible
set.seed(1)
#Use 70% of dataset as training set and remaining 30% as testing set
# sample() draws one TRUE/FALSE per row; TRUE rows -> train, FALSE rows -> test.
# (Do not reorder these calls: the split depends on the RNG stream after set.seed.)
sample_set <- sample(c(TRUE, FALSE), nrow(Delhi_cohesive_dataset), replace=TRUE, prob=c(0.7,0.3))
train_dataset <- Delhi_cohesive_dataset[sample_set, ]
test_dataset <- Delhi_cohesive_dataset[!sample_set, ]
# Base model: Daily_Delay explained by AQI, precipitation and CO
# (fixed comment — the previous one wrongly said "just AQI and tavg")
Base_Model_Delay <- lm(Daily_Delay ~ AQI+prcp+CO, data = train_dataset)
# Augmented model: the base predictors plus Month
# (fixed comment — the previous one wrongly said precipitation was the addition)
Aug_Model_Delay <- lm(Daily_Delay ~ AQI+prcp+CO+Month, data = train_dataset)
# Run cross validation trials on the two models
trials <- cv_pred_error(Base_Model_Delay, Aug_Model_Delay)
# Compare the two sets of cross-validated mean squared errors
t.test(mse ~ model, data = trials)
##
## Welch Two Sample t-test
##
## data: mse by model
## t = -3.7181, df = 5.5958, p-value = 0.0112
## alternative hypothesis: true difference in means between group Aug_Model_Delay and group Base_Model_Delay is not equal to 0
## 95 percent confidence interval:
## -219.25895 -43.35727
## sample estimates:
## mean in group Aug_Model_Delay mean in group Base_Model_Delay
## 3093.531 3224.839
# t-statistic is -3.7181 and the degrees of freedom are df = 5.5958. These are used with a t-distribution to derive the p-value of 0.0112
# p-value = 0.0112 - i.e., given that there is no actual/true difference in means, if we repeated the experiment over and over again, only about 1.1% of the time would we see the type of difference in means as in our samples, or a more extreme difference. Since the p-value is significantly lower than 0.05, the difference is significant.
# So we can reject the null hypothesis (H0) of no difference between the (true) averages of the two groups
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -219.25895 -43.35727
#If H0 is false, the true difference in means may lie in the interval [-219.25895, -43.35727].
# So we will chose the augmented model - i.e., Daily_Delay ~ AQI+prcp+CO+Month
## For our model to predict the air traffic delays:
## Response Variable is Daily_Delay
## Explanatory Variables are Precipitation (prcp), AQI, CO and Month
## We are choosing a linear regression model here because this is about predicting the numerical values
## and does not belong to classification modelling
# Final model: linear regression of daily delay on AQI, precipitation, CO and
# Month, fit on the 70% training split.
# (Fixed: use `<-` for assignment rather than `=`, per R style convention.)
Delhi_Traffic_Delay_Model <- lm(Daily_Delay ~ AQI+prcp+CO+Month, data = train_dataset)
summary(Delhi_Traffic_Delay_Model)
##
## Call:
## lm(formula = Daily_Delay ~ AQI + prcp + CO + Month, data = train_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.524 -32.566 -13.216 9.351 280.643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.7776 14.0751 2.826 0.00523 **
## AQI 0.1055 0.0416 2.536 0.01203 *
## prcp 0.9591 0.4172 2.299 0.02263 *
## CO -17.1722 13.0220 -1.319 0.18891
## Month 4.0846 1.2420 3.289 0.00121 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.49 on 184 degrees of freedom
## Multiple R-squared: 0.09215, Adjusted R-squared: 0.07241
## F-statistic: 4.669 on 4 and 184 DF, p-value: 0.001301
# Predict daily delays for the held-out 30% test split and echo them
Predicted_Traffic_Delay <- predict(Delhi_Traffic_Delay_Model, newdata = test_dataset)
Predicted_Traffic_Delay
## 4 6 7 15 17 18 20 21
## 67.86179 75.13417 69.56807 88.21506 78.60849 131.77652 71.05416 107.11761
## 29 35 37 39 41 43 46 49
## 74.06803 72.26148 78.91427 111.01858 106.23819 92.53629 67.35221 59.99422
## 52 61 68 70 72 76 77 79
## 62.48193 64.49781 52.87972 64.13694 73.84210 60.09088 48.85431 66.46546
## 80 82 85 87 94 95 96 99
## 48.73530 50.33918 45.13479 52.97105 54.68656 59.41816 53.72444 55.23866
## 104 109 111 112 117 121 125 135
## 73.44033 56.54381 54.40547 72.88220 63.95472 67.15259 72.10652 69.19774
## 139 145 148 150 162 164 165 169
## 74.85769 53.49240 101.39792 48.88850 68.52069 63.48317 62.02874 89.37714
## 172 173 176 178 180 183 185 187
## 68.33729 64.47273 67.76635 63.30805 63.98892 80.54250 66.41104 73.81277
## 188 189 191 194 198 200 210 211
## 72.46788 109.24905 67.07992 65.66837 65.05209 68.90869 79.90820 76.39041
## 213 214 215 218 219 225 230 243
## 72.94546 73.86186 72.29595 72.40341 103.99104 69.99095 76.81900 93.29322
## 250 251 252 260 264 265
## 133.32210 85.98529 89.34110 51.35490 56.67005 54.05690
# Attach the model predictions to the test split
test_dataset["Predicted_Delay"] <- Predicted_Traffic_Delay
# Line up actual vs predicted delay per year/month. reframe() is the
# dplyr >= 1.1.0 replacement for multi-row summarise(), which emitted a
# deprecation warning here; it returns the same rows as an ungrouped tibble.
Summary_Model_Performace <- test_dataset %>%
  group_by(YEAR = year(ymd(Date)), Month) %>%
  reframe(Daily_Delay, Predicted_Delay)
Summary_Model_Performace
## # A tibble: 78 × 4
## # Groups: YEAR, Month [18]
## YEAR Month Daily_Delay Predicted_Delay
## <dbl> <dbl> <dbl> <dbl>
## 1 2018 6 30 67.9
## 2 2018 7 31 75.1
## 3 2018 7 8 69.6
## 4 2018 7 59 88.2
## 5 2018 7 45 78.6
## 6 2018 7 62 132.
## 7 2018 7 100 71.1
## 8 2018 7 61 107.
## 9 2018 8 105 74.1
## 10 2018 11 60 72.3
## # ℹ 68 more rows
# Actual vs predicted daily delay by month, faceted by year.
# (Fixed: legend label typo 'Predictede_Delay' -> 'Predicted_Delay'.)
ggplot(Summary_Model_Performace, aes(x = Month)) +
  geom_point(aes(y = Daily_Delay, color = 'Daily_Delay')) +
  geom_point(aes(y = Predicted_Delay, color = 'Predicted_Delay')) +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(title = "Model Performance") + facet_wrap(~YEAR)
# As we can see, the model is performing a bit OK for some months except for certain extreme
# cases of delays. So, the model needs further fine tuning or dataset needs to be reanalyzed.